diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000000..4077f91d3a683f --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,7 @@ +version: 2 +jobs: + build: + docker: + - image: circleci/python:3.7-node-browsers + steps: + - run: echo "hello world" diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 345e89ccf879fb..30108031f72308 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -155,6 +155,9 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then export LANG=C.UTF-8 export LC_ALL=C.UTF-8 export HCC_AMDGPU_TARGET=gfx900 + + ########## HIPIFY Caffe2 operators + ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" fi # Try to include Redis support for Linux builds @@ -195,6 +198,7 @@ else fi + ############################################################################### # Configure and make ############################################################################### diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index e4e0e9fc1d66f0..1e05bbdcb9b600 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -104,5 +104,5 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then echo "Building libtorch" # NB: Install outside of source directory (at the same level as the root # pytorch folder) so that it doesn't get cleaned away prior to docker push. - WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$PWD/../cpp-build" + WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$PWD/../cpp-build" fi diff --git a/.jenkins/pytorch/disabled-configs.txt b/.jenkins/pytorch/disabled-configs.txt index cdd51d3fb54a56..c7041697026085 100644 --- a/.jenkins/pytorch/disabled-configs.txt +++ b/.jenkins/pytorch/disabled-configs.txt @@ -3,3 +3,5 @@ # fail. You can use this to temporarily reserve a test name to # turn on CI side before PyTorch repository supports it. This # file has the same format as .jenkins/enabled-configs.txt + +py2-clang3.8-rocmnightly-ubuntu16.04-test diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 7dc760c06683f4..92ef7ad191adb0 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -57,7 +57,7 @@ test_cpp_api() { CPP_BUILD="$PWD/../cpp-build" rm -rf $CPP_BUILD mkdir -p $CPP_BUILD - WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$CPP_BUILD" + WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$CPP_BUILD" python tools/download_mnist.py --quiet -d test/cpp/api/mnist @@ -65,7 +65,7 @@ test_cpp_api() { # without these paths being set export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib" - "$CPP_BUILD"/libtorch/bin/test_api + "$CPP_BUILD"/caffe2/bin/test_api } if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index bc2762860dd2bd..7e5b98ee628cd1 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -9,11 +9,6 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch" -if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then - echo "Skipping ROCm tests for now" - exit 0 -fi - # JIT C++ extensions require ninja. git clone https://github.com/ninja-build/ninja --quiet pushd ninja @@ -49,13 +44,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then (cd test && ! 
get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") fi -export ATEN_DISABLE_AVX= -export ATEN_DISABLE_AVX2= if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then - export ATEN_DISABLE_AVX=1 -fi -if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then - export ATEN_DISABLE_AVX2=1 + export ATEN_CPU_CAPABILITY=default +elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then + export ATEN_CPU_CAPABILITY=avx fi test_python_nn() { @@ -104,12 +96,12 @@ test_libtorch() { echo "Testing libtorch" CPP_BUILD="$PWD/../cpp-build" if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then - "$CPP_BUILD"/libtorch/bin/test_jit + "$CPP_BUILD"/caffe2/bin/test_jit else - "$CPP_BUILD"/libtorch/bin/test_jit "[cpu]" + "$CPP_BUILD"/caffe2/bin/test_jit "[cpu]" fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist - OMP_NUM_THREADS=2 "$CPP_BUILD"/libtorch/bin/test_api + OMP_NUM_THREADS=2 "$CPP_BUILD"/caffe2/bin/test_api fi } diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cb5d13bc10f31..9e302869a8a8ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ endif() # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. include(CMakeDependentOption) +option(BUILD_TORCH "Build Torch" OFF) option(BUILD_CAFFE2 "Build Caffe2" ON) option(BUILD_ATEN "Build ATen" OFF) option(BUILD_BINARY "Build C++ binaries" ON) @@ -214,6 +215,7 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") # These flags are not available in GCC-4.8.5. Set only when using clang. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 805cfd72573b76..08ff783dea4657 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,6 +72,9 @@ For example: You do not need to repeatedly install after modifying python files. +In case you want to reinstall, make sure that you uninstall pytorch first by running `pip uninstall torch` +and `python setup.py clean`. Then you can install in `build develop` mode again. + ## Unit testing PyTorch's testing is located under `test/`. Run the entire test suite with @@ -146,9 +149,7 @@ working on: - Working on `torch/lib` and want to run your changes / rerun cmake? Run `python setup.py build_deps`. Note that this will rerun cmake for - every subdirectory in TH; if you are only working on one project, - consider editing `torch/lib/build_all.sh` and commenting out the - `build` lines of libraries you are not working on. + every subdirectory in TH. On the initial build, you can also speed things up with the environment variables `DEBUG` and `NO_CUDA`. 
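The reinstall advice above amounts to three commands. A minimal sketch of that workflow, assuming the `python setup.py build develop` install described earlier in CONTRIBUTING.md:

```sh
# Remove the installed package and any stale build artifacts first.
pip uninstall torch
python setup.py clean

# Then reinstall in build develop mode (as documented in CONTRIBUTING.md).
python setup.py build develop
```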
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index eb7cf48e316a24..462a12b086d2d0 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -80,14 +80,20 @@ add_subdirectory(src/TH) set(TH_CPU_INCLUDE # dense ${CMAKE_CURRENT_SOURCE_DIR}/src/TH - ${CMAKE_CURRENT_SOURCE_DIR}/src/THC ${CMAKE_CURRENT_BINARY_DIR}/src/TH - ${CMAKE_CURRENT_BINARY_DIR}/src/THC - ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_BINARY_DIR}/aten/src) list(APPEND ATen_CPU_INCLUDE ${TH_CPU_INCLUDE}) + +if(USE_CUDA OR USE_ROCM) + set(TH_CUDA_INCLUDE + # dense + ${CMAKE_CURRENT_SOURCE_DIR}/src/THC + ${CMAKE_CURRENT_BINARY_DIR}/src/THC) + list(APPEND ATen_CUDA_INCLUDE ${TH_CUDA_INCLUDE}) +endif() + add_subdirectory(src/THNN) # Find the HIP package, set the HIP paths, load the HIP CMake. diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index a7084d474d8ab2..2ae326a68ecebc 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -21,3 +21,4 @@ #include "ATen/TensorOptions.h" #include "ATen/Layout.h" #include "ATen/OptionsGuard.h" +#include "ATen/CUDAGuard.h" diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index 867ae4cb5f54bb..c1c78102a0fef8 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -30,6 +30,9 @@ class DataPtr { DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) : ptr_(data, ctx, ctx_deleter), device_(device) {} void* operator->() const { return ptr_.get(); } + void clear() { + ptr_.clear(); + } void* get() const { return ptr_.get(); } void* get_context() const { return ptr_.get_context(); } void* release_context() { return ptr_.release_context(); } diff --git a/aten/src/ATen/CUDAGuard.h b/aten/src/ATen/CUDAGuard.h new file mode 100644 index 00000000000000..8027084caa7c30 --- /dev/null +++ b/aten/src/ATen/CUDAGuard.h @@ -0,0 +1,110 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace at { + +/// A variant of `DeviceGuard` that augments it with an understanding of CUDA +/// streams. This guard can not only set and reset the current CUDA device, but +/// also set and reset the current CUDA stream. It is important to note that +/// because a CUDA stream is intrinsically associated with the CUDA device to +/// which it is bound, setting the CUDA stream *also* sets the current CUDA +/// device to that of the stream. +struct CUDAGuard { + /// Default constructor, does nothing and causes no change in the current + /// stream or device until `set_stream` or `set_device` is called. + CUDAGuard() = default; + + /// Sets the CUDA stream and its associated device as the current one (calls + /// `set_stream`). + explicit CUDAGuard(const CUDAStream& stream) { + set_stream(stream); + } + + /// Calls `set_device` with the given index. + explicit CUDAGuard(int32_t device) { + set_device(device); + } + + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). + CUDAGuard(CUDAGuard&& other) noexcept = default; + + /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). 
+ CUDAGuard& operator=(CUDAGuard&& other) { + device_guard_ = std::move(other.device_guard_); + original_streams_ = std::move(other.original_streams_); + other.original_streams_.clear(); + return *this; + } + + /// Resets the CUDA stream on each device to the one that was active upon + /// construction. + ~CUDAGuard() { + if (!original_streams_.empty()) { + for (size_t device = 0; device < original_streams_.size(); ++device) { + globalContext().uncheckedSetCurrentCUDAStreamOnDevice( + device, original_streams_[device]); + } + } + } + + /// Sets the current CUDA device to the device associated with the given + /// stream, and then sets the current stream on that device to the one given. + void set_stream(const CUDAStream& stream) { + device_guard_.set_index(stream.device()); + // If we haven't stored the current stream yet, store it now. + if (original_streams_.empty()) { + const size_t device_count = globalContext().getNumGPUs(); + original_streams_.reserve(device_count); + for (size_t device = 0; device < device_count; ++device) { + original_streams_.push_back( + globalContext().getCurrentCUDAStreamOnDevice(device)); + } + } + globalContext().setCurrentCUDAStreamOnDevice( + device_guard_.last_index(), stream); + } + + /// Sets the CUDA device to the given one. + void set_device(int32_t device) { + device_guard_.set_index(device); + } + + /// Returns the CUDA streams that were active in the first call to + /// `set_stream`. If there was no such call, the returned container is + /// empty. + ArrayRef original_streams() const noexcept { + return original_streams_; + } + + /// Returns the device that was set upon construction of the guard. + int32_t original_device() const noexcept { + return device_guard_.original_index(); + } + + /// Returns the last device that was set via `set_device`, if any. + int32_t last_device() const noexcept { + return device_guard_.last_index(); + } + + private: + /// The guard for the current device. + DeviceGuard device_guard_; + /// The original streams that were active on all devices. + std::vector original_streams_; +}; + +} // namespace at diff --git a/aten/src/ATen/CUDAStream.cpp b/aten/src/ATen/CUDAStream.cpp index ad9d51cbf46f2b..b8b8d588ffbfc4 100644 --- a/aten/src/ATen/CUDAStream.cpp +++ b/aten/src/ATen/CUDAStream.cpp @@ -1,10 +1,10 @@ -#include "ATen/CUDAStream.h" + #include "ATen/CUDAStream.h" #include "ATen/Error.h" #include "ATen/detail/CUDAHooksInterface.h" #include -// Internal implementation is entirely hidden +// Internal implementation is entirely hidden struct CUDAStreamInternals { bool is_destructible; std::atomic refcount; @@ -29,7 +29,7 @@ namespace detail { // Creates a(n indestructible) default stream for each device // Note: the default stream on each device is signified by a zero // value for the pointer, and so is not actually created as usual. - // In particular, we don't need to switch devices when creating the + // In particular, we don't need to switch devices when creating the // streams. 
static void initDefaultCUDAStreams() { num_gpus = getCUDAHooks().getNumGPUs(); @@ -46,8 +46,8 @@ namespace detail { static void initCUDAStreamsOnce() { // Inits default streams (once, globally) std::call_once(init_flag, initDefaultCUDAStreams); - - // Inits current streams (thread local) to default streams + + // Inits current streams (thread local) to default streams if (current_streams) return; current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { @@ -68,7 +68,7 @@ namespace detail { // Helper to verify the GPU index is valid static inline void check_gpu(int64_t device) { - AT_CHECK(device >= 0 && device < num_gpus); + AT_ASSERT(device >= 0 && device < num_gpus); } CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { @@ -91,14 +91,14 @@ namespace detail { } // Note: despite not being "unsafe," is using these methods in a multithreaded - // environment then the caller must be sure that streams are valid + // environment then the caller must be sure that streams are valid // when they're requested. These methods will throw an error if an // invalid stream is requested. CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { initCUDAStreamsOnce(); check_gpu(device); auto cur = current_streams[device]; - AT_CHECK(CUDAStream_retain(cur)); + AT_ASSERT(CUDAStream_retain(cur)); return cur; } CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { @@ -120,44 +120,58 @@ namespace detail { void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { initCUDAStreamsOnce(); check_gpu(device); - AT_CHECK(ptr); - AT_CHECK(ptr->device == device); - AT_CHECK(CUDAStream_retain(ptr)); + AT_ASSERT(ptr); + AT_ASSERT(ptr->device == device); + AT_ASSERT(CUDAStream_retain(ptr)); CUDAStream_free(current_streams[device]); current_streams[device] = ptr; } + + void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + CUDAStream_uncheckedFree(current_streams[device]); + current_streams[device] = ptr; + } + void CUDAStream_setStream(CUDAStreamInternals* ptr) { CUDAStream_setStreamOnDevice(current_device(), ptr); } // Getters cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { - AT_CHECK(ptr); + AT_ASSERT(ptr); return ptr->stream; } int64_t CUDAStream_device(CUDAStreamInternals* ptr) { - AT_CHECK(ptr); + AT_ASSERT(ptr); return ptr->device; } // Memory management // Note: only destructible (non-default) streams are ref counted bool CUDAStream_retain(CUDAStreamInternals* ptr) { - AT_CHECK(ptr); + AT_ASSERT(ptr); if (ptr->is_destructible) return(++ptr->refcount > 1); return true; } void CUDAStream_free(CUDAStreamInternals*& ptr) { if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { - AT_CHECK(ptr->refcount == 0); + AT_ASSERT(ptr->refcount == 0); DynamicCUDAInterface::cuda_stream_destroy(ptr->stream); free(ptr); ptr = nullptr; } } + void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + DynamicCUDAInterface::unchecked_cuda_stream_destroy(ptr->stream); + free(ptr); + ptr = nullptr; + } + } } // namespace detail @@ -167,17 +181,17 @@ namespace detail { // Copy constructor CUDAStream::CUDAStream(const CUDAStream& other) { - AT_CHECK(other.internals_); - AT_CHECK(detail::CUDAStream_retain(other.internals_)); + AT_ASSERT(other.internals_); + AT_ASSERT(detail::CUDAStream_retain(other.internals_)); 
internals_ = other.internals_; } // Move constructor CUDAStream::CUDAStream(CUDAStream&& other) { - AT_CHECK(other.internals_); + AT_ASSERT(other.internals_); std::swap(internals_, other.internals_); } - + } // namespace at diff --git a/aten/src/ATen/CUDAStream.h b/aten/src/ATen/CUDAStream.h index 34a1295b712da8..c5fc8111e13bef 100644 --- a/aten/src/ATen/CUDAStream.h +++ b/aten/src/ATen/CUDAStream.h @@ -5,9 +5,9 @@ /* * A CUDA stream interface with no CUDA build dependency. -* +* * Includes the CUDAStream RAII class and a pointer-based stream API. -* +* * The ATen Context interface should be preferred when working with streams. */ @@ -39,6 +39,9 @@ CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +void CUDAStream_uncheckedSetStreamOnDevice( + int64_t device, + CUDAStreamInternals* internals); void CUDAStream_setStream(CUDAStreamInternals* internals); cudaStream_t CUDAStream_stream(CUDAStreamInternals*); @@ -46,6 +49,7 @@ int64_t CUDAStream_device(CUDAStreamInternals*); bool CUDAStream_retain(CUDAStreamInternals*); void CUDAStream_free(CUDAStreamInternals*&); +void CUDAStream_uncheckedFree(CUDAStreamInternals*&); } // namespace detail @@ -58,23 +62,33 @@ struct CUDAStream { // Constructors CUDAStream() = default; - CUDAStream(CUDAStreamInternals* internals) : internals_{internals} { } - + /* implicit */ CUDAStream(CUDAStreamInternals* internals, bool retain = false) + : internals_{internals} { + if (retain) { + detail::CUDAStream_retain(internals_); + } + } + // Destructor - ~CUDAStream() { detail::CUDAStream_free(internals_); } + ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } // Copy constructor CUDAStream(const CUDAStream& other); // Move constructor - CUDAStream(CUDAStream&& other); + CUDAStream(CUDAStream&& other); // Assignment operator - CUDAStream& operator=(CUDAStream other) { + CUDAStream& operator=(CUDAStream other) noexcept { std::swap(internals_, other.internals_); return *this; } + // Returns true if the CUDAStream is not null. 
+ explicit operator bool() const noexcept { + return internals_ != nullptr; + } + // Implicit conversion to cudaStream_t operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index accb57bbda67b4..9a9125ccedfe0a 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -114,6 +114,12 @@ class AT_API Context { return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); } + void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) + const { + return detail::CUDAStream_uncheckedSetStreamOnDevice( + device, stream.internals()); + } + #ifndef __HIP_PLATFORM_HCC__ cusparseHandle_t getCurrentCUDASparseHandle() const { return detail::getCUDAHooks().getCurrentCUDASparseHandle(thc_state.get()); diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index ceb6123301acf6..f38d4e8a674cb1 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -2479,21 +2479,7 @@ - THTensor* other ]] [[ - name: clamp - cname: clamp - variants: - - method - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real min - - real max -]] -[[ - name: clamp_ + name: _th_clamp_ cname: clamp variants: - method @@ -2506,20 +2492,7 @@ - real max ]] [[ - name: clamp_min - cname: cmaxValue - variants: - - method - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real min -]] -[[ - name: clamp_min_ + name: _th_clamp_min_ cname: cmaxValue variants: - method @@ -2531,20 +2504,7 @@ - real min ]] [[ - name: clamp_max - cname: cminValue - variants: - - method - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real max -]] -[[ - name: clamp_max_ + name: _th_clamp_max_ cname: cminValue variants: - method @@ -3130,7 +3090,7 @@ default: S ]] [[ - name: inverse + name: _getri cname: getri types: - Float @@ -3691,38 +3651,6 @@ - THTensor* src ]] -[[ - name: as_strided - variants: [method,function] - cpu_half: True - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THSize* size - - THStride* stride - - arg: int64_t storage_offset - aten_custom_call: | - ${THTensor}_setStorage(${state,}result_->tensor, self_->tensor->storage, storage_offset, size_, stride_); - result_->maybeScalar(size.size() == 0); -]] - -[[ - name: as_strided_ - variants: [method] - cpu_half: True - return: argument 0 - arguments: - - THTensor* self - - THSize* size - - THStride* stride - - arg: int64_t storage_offset - aten_custom_call: | - ${THTensor}_setStorage(${state,}self_->tensor, self_->tensor->storage, storage_offset, size_, stride_); - self_->maybeScalar(size.size() == 0); -]] - [[ name: _cat cname: catArray diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index a6afa187e55062..2d9c27f8d4cc3d 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { /// Represents a a compute device on which a tensor is located. 
A device is @@ -112,3 +113,16 @@ struct Device { std::ostream& operator<<(std::ostream& stream, at::Device::Type type); std::ostream& operator<<(std::ostream& stream, const at::Device& device); + +namespace std { + template<> struct hash + { + size_t operator()(const at::Device& device) const noexcept { + size_t hash_val = static_cast(device.index() + 1); + if (device.is_cuda()) { + hash_val += 2; + } + return hash_val; + } + }; +} // namespace std diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index 6a3b84dcde9445..142eddab3d7345 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -28,7 +28,7 @@ struct DeviceGuard { } } - /// Calls `set_device` with the given index. + /// Calls `set_index` with the given index. explicit DeviceGuard(int32_t index) { set_index(index); } @@ -46,6 +46,29 @@ struct DeviceGuard { } } + /// Copy is disallowed. + DeviceGuard(const DeviceGuard&) = delete; + DeviceGuard& operator=(const DeviceGuard&) = delete; + + /// Move-constructs this `DeviceGuard` from another `DeviceGuard`. The + /// moved-from `DeviceGuard` is modified such that its destruction has no + /// effect (does not reset the device). + DeviceGuard(DeviceGuard&& other) noexcept { + *this = std::move(other); + } + + /// Move-assigns this `DeviceGuard` from another `DeviceGuard`. The + /// moved-from `DeviceGuard` is modified such that its destruction has no + /// effect (does not reset the device). + DeviceGuard& operator=(DeviceGuard&& other) noexcept { + this->original_index_ = other.original_index_; + this->last_index_ = other.last_index_; + // Set other's original index to the unspecified/default state, so that it + // doesn't also reset the device in its constructor. + other.original_index_ = -1; + return *this; + } + /// Resets the device to the index that was active at construction of the /// guard. ~DeviceGuard() { @@ -88,7 +111,7 @@ struct DeviceGuard { return original_index_; } - // /// Returns the last device that was set via `set_device`, if any. + /// Returns the last device that was set via `set_index`, if any. int32_t last_index() const noexcept { return last_index_; } @@ -96,7 +119,7 @@ struct DeviceGuard { private: /// The original device that was active at construction of this object. int32_t original_index_ = -1; - /// The last index that was set via `set_device`. + /// The last index that was set via `set_index`. 
int32_t last_index_ = -1; }; } // namespace at diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 11c6ca8103e218..d1e97e194561f5 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -1,7 +1,7 @@ #pragma once #include "TH/TH.h" -#include "TH/THStorage.hpp" +#include "TH/THStorageFunctions.hpp" #include "TH/THTypeConversion.hpp" namespace at { @@ -37,7 +37,7 @@ class THLongStorageView { */ THLongStorageView(ArrayRef ref, THLongStorageViewKind kind) - : zero_dim_to_null(false) + : storage(at::CTypeToScalarType>::to(), 0, getTHDefaultAllocator(), 0), zero_dim_to_null(false) { // zero_dim_to_one converts an empty ArrayRef into [1] // zero_dim_to_null converts an empty ArrayRef into a null THLongStorage diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 296992468916f0..5872764a905ce7 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -64,15 +64,22 @@ void cuda_stream_destroy(cudaStream_t stream) { check_status(cudaStreamDestroy(stream)); } +void unchecked_cuda_stream_destroy(cudaStream_t stream) { + const auto return_code = cudaStreamDestroy(stream); + (void)return_code; +} + struct DynamicCUDAInterfaceSetter { DynamicCUDAInterfaceSetter() { - at::detail::DynamicCUDAInterface::set_device = set_device; - at::detail::DynamicCUDAInterface::get_device = get_device; - at::detail::DynamicCUDAInterface::unchecked_set_device = - unchecked_set_device; - at::detail::DynamicCUDAInterface::cuda_stream_create_with_priority = - cuda_stream_create_with_priority; - at::detail::DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; + using at::detail::DynamicCUDAInterface; + DynamicCUDAInterface::set_device = set_device; + DynamicCUDAInterface::get_device = get_device; + DynamicCUDAInterface::unchecked_set_device = unchecked_set_device; + DynamicCUDAInterface::cuda_stream_create_with_priority = + cuda_stream_create_with_priority; + DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; + DynamicCUDAInterface::unchecked_cuda_stream_destroy = + unchecked_cuda_stream_destroy; } }; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index b6897ed0d6e270..288b066feafeb1 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -29,14 +29,20 @@ void default_unchecked_set_device(int32_t) { void default_cuda_stream_create_with_priority(cudaStream_t*, int32_t, int32_t) { AT_ERROR( - "DynamicCUDAInterface::cuda_stream_create_with_priority called " - "before CUDA library was loaded"); + "DynamicCUDAInterface::cuda_stream_create_with_priority called " + "before CUDA library was loaded"); } void default_cuda_stream_destroy(cudaStream_t) { AT_ERROR( - "DynamicCUDAInterface::cuda_stream_destroy called " - "before CUDA library was loaded"); + "DynamicCUDAInterface::cuda_stream_destroy called " + "before CUDA library was loaded"); +} + +void default_unchecked_cuda_stream_destroy(cudaStream_t) { + AT_ERROR( + "DynamicCUDAInterface::unchecked_cuda_stream_destroy called " + "before CUDA library was loaded"); } // Default the static members of DynamicCUDAInterface. 
@@ -44,11 +50,14 @@ void (*DynamicCUDAInterface::set_device)(int32_t) = default_set_device; void (*DynamicCUDAInterface::get_device)(int32_t*) = default_get_device; void (*DynamicCUDAInterface::unchecked_set_device)(int32_t) = default_unchecked_set_device; -void (*DynamicCUDAInterface::cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t) - = default_cuda_stream_create_with_priority; -void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) - = default_cuda_stream_destroy; - +void (*DynamicCUDAInterface::cuda_stream_create_with_priority)( + cudaStream_t*, + int32_t, + int32_t) = default_cuda_stream_create_with_priority; +void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) = + default_cuda_stream_destroy; +void (*DynamicCUDAInterface::unchecked_cuda_stream_destroy)(cudaStream_t) = + default_unchecked_cuda_stream_destroy; const CUDAHooksInterface& getCUDAHooks() { static std::unique_ptr cuda_hooks; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index e15cf363bba038..f0596d01949d8f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -177,6 +177,7 @@ struct AT_API DynamicCUDAInterface { static void (*unchecked_set_device)(int32_t); static void (*cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t); static void (*cuda_stream_destroy)(cudaStream_t); + static void (*unchecked_cuda_stream_destroy)(cudaStream_t); }; } // namespace detail } // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.h b/aten/src/ATen/detail/UniqueVoidPtr.h index 866c0efc011b5e..e277014a7935d6 100644 --- a/aten/src/ATen/detail/UniqueVoidPtr.h +++ b/aten/src/ATen/detail/UniqueVoidPtr.h @@ -45,6 +45,10 @@ class UniqueVoidPtr { UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) : data_(data), ctx_(ctx, ctx_deleter ? 
ctx_deleter : &deleteNothing) {} void* operator->() const { return data_; } + void clear() { + ctx_ = nullptr; + data_ = nullptr; + } void* get() const { return data_; } void* get_context() const { return ctx_.get(); } void* release_context() { return ctx_.release(); } diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp new file mode 100644 index 00000000000000..662ae580c599a8 --- /dev/null +++ b/aten/src/ATen/native/DispatchStub.cpp @@ -0,0 +1,44 @@ +#include "DispatchStub.h" + +#include + +#include +#include +#include + +namespace at { namespace native { + +static CPUCapability compute_cpu_capability() { + auto envar = std::getenv("ATEN_CPU_CAPABILITY"); + if (envar) { + if (strcmp(envar, "avx2") == 0) { + return CPUCapability::AVX2; + } + if (strcmp(envar, "avx") == 0) { + return CPUCapability::AVX; + } + if (strcmp(envar, "default") == 0) { + return CPUCapability::DEFAULT; + } + AT_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar); + } + +#ifndef __powerpc__ + if (cpuinfo_initialize()) { + if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) { + return CPUCapability::AVX2; + } + if (cpuinfo_has_x86_avx()) { + return CPUCapability::AVX; + } + } +#endif + return CPUCapability::DEFAULT; +} + +CPUCapability get_cpu_capability() { + static CPUCapability capability = compute_cpu_capability(); + return capability; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/CapabilityDispatch.h b/aten/src/ATen/native/DispatchStub.h similarity index 51% rename from aten/src/ATen/native/cpu/CapabilityDispatch.h rename to aten/src/ATen/native/DispatchStub.h index 6cb0f279872d66..bbdf07a8458bf1 100644 --- a/aten/src/ATen/native/cpu/CapabilityDispatch.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -1,8 +1,8 @@ #pragma once -#include +#include +#include #include -#include // Implements instruction set specific function dispatch. // @@ -23,72 +23,82 @@ // REGISTER_DISPATCH(stub, &kernel); // // To call: -// stub(tensor); +// stub(kCPU, tensor); // namespace at { namespace native { -enum class CPUCapability { DEFAULT, AVX, AVX2, NUM_OPTIONS }; +enum class CPUCapability { + DEFAULT = 0, + AVX = 1, + AVX2 = 2, + NUM_OPTIONS +}; + +CPUCapability get_cpu_capability(); template struct DispatchStub { static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); template - void operator()(ArgTypes... args) { - if (!dispatch_ptr) { - dispatch_ptr = choose_impl(); + void operator()(Backend backend, ArgTypes... 
args) { + if (backend == Backend::CPU) { + if (!dispatch_ptr) { + dispatch_ptr = choose_cpu_impl(); + } + (*dispatch_ptr)(args...); + } else if (backend == Backend::CUDA) { + AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); + (*cuda_dispatch_ptr)(args...); + } else { + AT_ERROR("DispatchStub: unsupported backend", backend); } - (*dispatch_ptr)(args...); } - FnPtr choose_impl() { -// Do not use cpuinfo on PowerPC as it shows confusing errors when run on ppc -#ifndef __powerpc__ - if (cpuinfo_initialize()) { - int avx2 = static_cast(CPUCapability::AVX2); - if (!std::getenv("ATEN_DISABLE_AVX2") && cpuinfo_has_x86_avx2() && - cpuinfo_has_x86_fma3() && table[avx2]) { - return table[avx2]; - } - int avx = static_cast(CPUCapability::AVX); - if (!std::getenv("ATEN_DISABLE_AVX") && cpuinfo_has_x86_avx() && table[avx]) { - return table[avx]; - } - } -#endif + FnPtr choose_cpu_impl() { int def = static_cast(CPUCapability::DEFAULT); + int avx = static_cast(CPUCapability::AVX); + int avx2 = static_cast(CPUCapability::AVX2); + + auto capability = static_cast(get_cpu_capability()); + if (capability >= avx2 && table[avx2]) { + return table[avx2]; + } + if (capability >= avx && table[avx]) { + return table[avx]; + } AT_ASSERTM(table[def], "DispatchStub: missing default kernel"); return table[def]; } FnPtr dispatch_ptr = nullptr; + FnPtr cuda_dispatch_ptr = nullptr; FnPtr table[static_cast(CPUCapability::NUM_OPTIONS)]; }; -#if defined(CPU_CAPABILITY) +#if defined(CPU_CAPABILITY) || defined(__CUDACC__) -constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::CPU_CAPABILITY; +namespace { -// Registers an implementation a kernel for the current CPU capability. -template +template struct RegisterDispatch { RegisterDispatch(DispatchStub& stub, FnPtr value) { - stub.table[static_cast(CURRENT_CAPABILITY)] = value; +#if defined(__CUDACC__) + stub.cuda_dispatch_ptr = value; +#else + int cap = static_cast(CPUCapability::CPU_CAPABILITY); + AT_ASSERT(!stub.table[cap]) + stub.table[cap] = value; +#endif } }; -// We only define the stub once in the DEFAULT capability compilation -#if defined(CPU_CAPABILITY_DEFAULT) -#define _DEFINE_STUB(stub, fn) DispatchStub stub -#else -#define _DEFINE_STUB(stub, fn) -#endif +} // anonymous namespace #define REGISTER_DISPATCH(stub, fn) \ - _DEFINE_STUB(stub, fn); \ static RegisterDispatch stub ## __register(stub, fn); #endif diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 42ea2813bc0726..9720adb4895769 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -294,10 +294,6 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten AT_ERROR( "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); } - if (source.dim() > 0 && numIndices != source.size(dim)) { - AT_ERROR( - "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); - } if (index.type().scalarType() != ScalarType::Long) { AT_ERROR("index_copy_(): Expected LongTensor for index"); } @@ -309,7 +305,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten } auto sourceSlicedSizes = std::vector(source.sizes()); if (sourceSlicedSizes.size() > 0) { - sourceSlicedSizes.erase(sourceSlicedSizes.begin()); + sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); } if (selfSlicedSizes.size() != sourceSlicedSizes.size() || !std::equal(selfSlicedSizes.begin(), selfSlicedSizes.end(), @@ -320,6 
+316,10 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; throw std::runtime_error(ss.str()); } + if (source.dim() > 0 && numIndices != source.size(dim)) { + AT_ERROR( + "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); + } return self._indexCopy_(dim, index, source); } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index ea87d42dfa58f0..388d704a834d48 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -83,10 +83,33 @@ std::tuple slogdet(const Tensor& self) { return std::make_tuple(det.sign(), diag_U.abs_().log_().sum()); } +Tensor inverse(const Tensor& self) { + Tensor result = self.type().tensor(); + return at::native::inverse_out(result, self); +} + +Tensor& inverse_out(Tensor &result, const Tensor &self) { + AT_CHECK(self.type().backend() == kCPU || self.type().backend() == kCUDA, + "tensor should have CPU or CUDA backend"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + AT_CHECK(self.size(0) == self.size(1), "tensor should be square"); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "tensor should be of floating-point type"); + if (self.size(0) == 0) { + return result.resize_({0, 0}); + } else { + return at::_getri_out(result, self); + } +} + Tensor pinverse(const Tensor& self, double rcond) { AT_CHECK(at::isFloatingType(self.type().scalarType()) && self.dim() == 2, "pinverse(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + if (self.numel() == 0) { + // Match NumPy + return self.type().tensor({self.size(1), self.size(0)}); + } Tensor U, S, V; std::tie(U, S, V) = self.svd(); double max_val = S[0].toCDouble(); diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index fd40bb3ab1f92c..d4ad799948b0c0 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -95,7 +95,9 @@ bound to ATen (in practice, C++ and Python.) **Argument names.** Argument names are meaningful; downstream binding code may make use of the specific argument name you provide, and a rename of an argument name is considered a BC-breaking change (e.g., you will probably need to update `tools/autograd/derivatives.yaml` at -least). +least). In `native_functions.yaml`, if your function's arguments include the result Tensor (usually +functions with an `_out` suffix), you must name that argument `Tensor result`. If there is more +than one result Tensor, name them `Tensor result0, Tensor result1, ...`. TODO: Do argument names affect Python keyword arguments? diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index affa9d24059d99..d055a91afa7596 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -17,6 +17,9 @@ namespace at { namespace native { +DispatchStub sum_kernel; +DispatchStub prod_kernel; + static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? 
ScalarType::Long : scalarType); @@ -127,7 +130,7 @@ Tensor sum(const Tensor &self) { Tensor _sum_cpu(const Tensor& self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - sum_kernel(result, self, at::nullopt); + sum_kernel(kCPU, result, self, at::nullopt); return result; } return self._sumall(); @@ -148,7 +151,7 @@ Tensor prod(const Tensor &self) { Tensor _prod_cpu(const Tensor &self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - prod_kernel(result, self, at::nullopt); + prod_kernel(kCPU, result, self, at::nullopt); return result; } return self._prodall(); @@ -222,7 +225,7 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - sum_kernel(result, self, dim); + sum_kernel(kCPU, result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } @@ -260,7 +263,7 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - prod_kernel(result, self, dim); + prod_kernel(kCPU, result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index ff83c78f4554f6..56e0ab6ca6ba10 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -14,6 +14,9 @@ template void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { int64_t outer_size = 1; int64_t dim_size = input.size(dim); + if (input.numel() == 0) { + return; + } int64_t inner_size = 1; for (int64_t i = 0; i < dim; ++i) outer_size *= input.size(i); @@ -125,7 +128,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - softmax_lastdim_kernel(output, input); + softmax_lastdim_kernel(kCPU, output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] { host_softmax(output, input, dim); @@ -144,7 +147,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - log_softmax_lastdim_kernel(output, input); + log_softmax_lastdim_kernel(kCPU, output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] { host_softmax(output, input, dim); @@ -173,7 +176,7 @@ Tensor softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - softmax_backward_lastdim_kernel(grad_input, grad, output); + softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, dim); @@ -202,7 +205,7 @@ Tensor log_softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - log_softmax_backward_lastdim_kernel(grad_input, grad, output); + log_softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, 
dim); @@ -210,5 +213,11 @@ Tensor log_softmax_backward_cpu( } return grad_input; } + +DispatchStub softmax_lastdim_kernel; +DispatchStub log_softmax_lastdim_kernel; +DispatchStub softmax_backward_lastdim_kernel; +DispatchStub log_softmax_backward_lastdim_kernel; + } } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 17770cba906534..5d1c883bee3f1f 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -188,26 +188,24 @@ Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalize } -Tensor stft(const Tensor& self, const int64_t frame_length, - const int64_t hop, const int64_t fft_size, - const bool normalized, const bool onesided, - const Tensor& window, const int64_t pad_end) { +Tensor stft(const Tensor& self, const int64_t n_fft, const int64_t hop_length, + const int64_t win_length, const Tensor& window, + const bool normalized, const bool onesided) { #define REPR(SS) \ - SS << "stft(" << self.type() << self.sizes() << ", frame_length=" \ - << frame_length << ", hop=" << hop << ", fft_size=" << fft_size \ - << ", normalized=" << normalized << ", onesided=" << onesided << \ - ", window="; \ + SS << "stft(" << self.type() << self.sizes() << ", n_fft=" << n_fft \ + << ", hop_length=" << hop_length << ", win_length=" << win_length \ + << ", window="; \ if (window.defined()) { \ SS << window.type() << "{" << window.sizes() << "}"; \ } else { \ SS << "None"; \ } \ - SS << ", pad_end=" << pad_end << ")" + SS << ", normalized=" << normalized << ", onesided=" << onesided << ")" if (!at::isFloatingType(self.type().scalarType()) || self.dim() > 2 || self.dim() < 1) { std::ostringstream ss; REPR(ss) << ": expected a 1D or 2D tensor of floating types"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } Tensor input = self; if (self.dim() == 1) { @@ -215,66 +213,52 @@ Tensor stft(const Tensor& self, const int64_t frame_length, } int64_t batch = input.size(0); int64_t len = input.size(1); - if (pad_end < 0) { - std::ostringstream ss; - REPR(ss) << ": expected pad_end >= 0, but got pad_end=" << pad_end; - throw std::runtime_error(ss.str()); - } - // pad zeros - if (pad_end != 0) { - Tensor padded_input = at::zeros({batch, len + pad_end}, self.type()); - padded_input.narrow(1, 0, len).copy_(input); - len += pad_end; - input = padded_input; - } - if (frame_length <= 0 || frame_length > len) { + if (n_fft <= 0 || n_fft > len) { std::ostringstream ss; - REPR(ss) << ": expected 0 < frame_length < " << len - << ", but got frame_length=" << frame_length; - throw std::runtime_error(ss.str()); + REPR(ss) << ": expected 0 < n_fft < " << len + << ", but got n_fft=" << win_length; + AT_ERROR(ss.str()); } - if (hop <= 0) { + if (hop_length <= 0) { std::ostringstream ss; - REPR(ss) << " expected hop > 0, but got hop=" << hop; + REPR(ss) << ": expected hop_length > 0, but got hop_length=" << hop_length; throw std::runtime_error(ss.str()); } - if (fft_size <= 0) { + if (win_length <= 0 || win_length > n_fft) { std::ostringstream ss; - REPR(ss) << " expected fft_size > 0, but got fft_size=" << fft_size; - throw std::runtime_error(ss.str()); + REPR(ss) << ": expected 0 < win_length <= n_fft, but got win_length=" + << win_length; + AT_ERROR(ss.str()); } - if (window.defined() && (window.dim() != 1 || window.size(0) != frame_length)) { + if (window.defined() && (window.dim() != 1 || window.size(0) != win_length)) { std::ostringstream ss; - REPR(ss) << ": expected a 1D window tensor of size equal to " - << 
"frame_length=" << frame_length - << ", but got window with size " << window.sizes(); - throw std::runtime_error(ss.str()); + REPR(ss) << ": expected a 1D window tensor of size equal to win_length=" + << win_length << ", but got window with size " << window.sizes(); + AT_ERROR(ss.str()); } #undef REPR - int64_t return_size = onesided ? infer_ft_real_to_complex_onesided_size(fft_size) : fft_size; - // build ft kernel - // k[omega, t] = cos (2 pi omega t / N) - j sin (2 pi omega t / N) - double N = static_cast(fft_size); - auto freq_arange = at::arange(0, return_size, self.type()).mul_(M_PI * 2. / N); - auto time_arange = at::arange(0, frame_length, self.type()); - auto arange_2d = at::ger(freq_arange, time_arange); - auto re_kernel = arange_2d.cos(); - auto im_kernel = arange_2d.sin().neg_(); - auto kernel = at::cat({re_kernel, im_kernel}, 0); - if (window.defined()) { - kernel *= window.view({1, -1}); + auto window_ = window; + if (win_length < n_fft) { + // pad center + window_ = at::zeros({n_fft}, self.options()); + auto left = (n_fft - win_length) / 2; + if (window.defined()) { + window_.narrow(0, left, win_length).copy_(window); + } else { + window_.narrow(0, left, win_length).fill_(1); + } } - if (normalized) { - double T = static_cast(frame_length); - kernel.div_(std::sqrt(T)); + int64_t n_frames = 1 + (len - n_fft) / hop_length; + // time2col + input = input.as_strided( + {batch, n_frames, n_fft}, + {input.stride(0), hop_length * input.stride(1), input.stride(1)} + ); + if (window_.defined()) { + input = input.mul(window_); } - // prepare for conv1d - input = input.view({batch, 1, len}); - kernel = kernel.view({return_size * 2, 1, frame_length}); - // conv is actually correlation, so we are good - auto conv_out = at::conv1d(input, kernel, {}, hop).squeeze_(-1); - // transpose to [batch x time x freq x (re/im)] - auto out = conv_out.view({batch, 2, return_size, -1}).transpose_(1, -1); + // rfft and transpose to get (batch x fft_size x num_frames) + auto out = input.rfft(1, normalized, onesided).transpose_(1, 2); if (self.dim() == 1) { return out.squeeze_(0); } else { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c8f6158994acb3..13887a52633bc2 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -135,12 +135,20 @@ Tensor expand_as(const Tensor& self, const Tensor& other) { return self.expand(other.sizes()); } +Tensor as_strided(const Tensor& self, IntList size, IntList stride, int64_t storage_offset) { + return self.type().tensor().set_(*self.storage(), storage_offset, size, stride); +} + +Tensor &as_strided_(Tensor& self, IntList size, IntList stride, int64_t storage_offset) { + return self.set_(*self.storage(), storage_offset, size, stride); +} + Tensor as_strided(const Tensor& self, IntList size, IntList stride) { - return self.as_strided(size, stride, self.storage_offset()); + return at::as_strided(self, size, stride, self.storage_offset()); } Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { - return self.as_strided_(size, stride, self.storage_offset()); + return at::as_strided_(self, size, stride, self.storage_offset()); } Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { @@ -265,6 +273,10 @@ Tensor reshape(const Tensor& self, IntList proposed_shape) { return at::_unsafe_view(self.clone(), shape); } +Tensor reshape_as(const Tensor& self, const Tensor& other) { + return self.reshape(other.sizes()); +} + Tensor select(const Tensor& self, int64_t 
dim, int64_t index) { int64_t ndim = self.dim(); AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index f32a206123ad75..17f5a437b00155 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -24,6 +24,55 @@ namespace at { namespace native { +Tensor clamp(const Tensor& self, Scalar min, Scalar max) { + Tensor result = self.type().tensor(); + return clamp_out(result, self, min, max); +} + +Tensor clamp_max(const Tensor& self, Scalar max) { + Tensor result = self.type().tensor(); + return clamp_max_out(result, self, max); +} + +Tensor clamp_min(const Tensor& self, Scalar min) { + Tensor result = self.type().tensor(); + return clamp_min_out(result, self, min); +} + +Tensor& _clamp__cpu(Tensor& self, Scalar min, Scalar max) { + return _th_clamp_(self, min, max); +} + +Tensor& _clamp_out_cpu( + Tensor& result, + const Tensor& self, + Scalar min, + Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_(result, min, max); +} + +Tensor& _clamp_max__cpu(Tensor& self, Scalar max) { + return _th_clamp_max_(self, max); +} + +Tensor& _clamp_max_out_cpu(Tensor& result, const Tensor& self, Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_max_(result, max); +} + +Tensor& _clamp_min__cpu(Tensor& self, Scalar min) { + return _th_clamp_min_(self, min); +} + +Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_min_(result, min); +} + Tensor& fill_(Tensor& self, Scalar value) { return self._fill_(value); } @@ -43,14 +92,14 @@ Tensor& fill_(Tensor& self, const Tensor& value) { Tensor& _##op##__cpu(Tensor& self_) { \ if (self_.numel() > 0) { \ Tensor self = sort_strides(self_); \ - op##Impl(self, self); \ + op##Impl(kCPU, self, self); \ } \ return self_; \ } \ Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ result.resize_(self.sizes()); \ if (result.numel() > 0) { \ - op##Impl(result, self); \ + op##Impl(kCPU, result, self); \ } \ return result; \ } @@ -96,5 +145,29 @@ IMPLEMENT_UNARY_OP_VEC(tan) IMPLEMENT_UNARY_OP_VEC(tanh) IMPLEMENT_UNARY_OP_VEC(trunc) +DispatchStub absImpl; +DispatchStub acosImpl; +DispatchStub asinImpl; +DispatchStub atanImpl; +DispatchStub ceilImpl; +DispatchStub cosImpl; +DispatchStub erfImpl; +DispatchStub erfcImpl; +DispatchStub expImpl; +DispatchStub expm1Impl; +DispatchStub floorImpl; +DispatchStub logImpl; +DispatchStub log10Impl; +DispatchStub log1pImpl; +DispatchStub log2Impl; +DispatchStub roundImpl; +DispatchStub rsqrtImpl; +DispatchStub sigmoidImpl; +DispatchStub sinImpl; +DispatchStub sqrtImpl; +DispatchStub tanImpl; +DispatchStub tanhImpl; +DispatchStub truncImpl; + } } // namespace at diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h index 9481b90fe76965..5a7854d0094cd5 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h index dbd703b6d3c028..39d7e68a8b20c9 100644 --- a/aten/src/ATen/native/cpu/SoftmaxKernel.h +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -1,7 +1,7 @@ #pragma once #include -#include "CapabilityDispatch.h" +#include 
namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 7416923cfd8867..459838a9b6c689 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -4,7 +4,7 @@ #include "ATen/Dispatch.h" #include "ATen/cpu/vml.h" #include "ATen/CPUApplyUtils.h" -#include "ATen/native/cpu/CapabilityDispatch.h" +#include "ATen/native/DispatchStub.h" #ifdef __AVX2__ #include "ATen/native/cpu/avx_mathfun.h" #endif diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h index d9bffadd1e1fbd..d4845760f7248d 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.h +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp index 2e524f4d8e62d4..2bef41ee251955 100644 --- a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp +++ b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp @@ -2,6 +2,40 @@ namespace at { namespace native { +Tensor& _clamp__cuda(Tensor& self, Scalar min, Scalar max) { + return _th_clamp_(self, min, max); +} + +Tensor& _clamp_out_cuda( + Tensor& result, + const Tensor& self, + Scalar min, + Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_(result, min, max); +} + +Tensor& _clamp_max__cuda(Tensor& self, Scalar max) { + return _th_clamp_max_(self, max); +} + +Tensor& _clamp_max_out_cuda(Tensor& result, const Tensor& self, Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_max_(result, max); +} + +Tensor& _clamp_min__cuda(Tensor& self, Scalar min) { + return _th_clamp_min_(self, min); +} + +Tensor& _clamp_min_out_cuda(Tensor& result, const Tensor& self, Scalar min) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_min_(result, min); +} + // These are just forwarding stubs #define IMPLEMENT_UNARY_OP_PREQUEL(op) \ diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index fa1c7628afb9e3..cc8e78c292dbc9 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -39,7 +39,8 @@ kernel_pointwise_flip_apply2(const cuda::detail::TensorInfo template __global__ -void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) { +void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, + int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) { int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; if (linear_index >= N) { @@ -99,18 +100,22 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { auto out_tensor = at::empty_like(in_tensor); - // stride_contiguous is the stride of non-contiguous tensor after called contiguous(), it is used to compute indices for each element in non-contiguous tensor + // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), + // it is used to compute indices for each element in non-contiguous tensor Tensor stride_contiguous = at::zeros({total_dims}, kLong); int64_t* stride_contiguous_d = stride_contiguous.data(); - int64_t 
tmp = N; - for (int64_t i = 0; i < total_dims; i++) { - tmp = tmp / shape[i]; - stride_contiguous_d[i] = tmp; + for (int64_t i = total_dims - 1; i >= 0; i--) { + if (i == total_dims - 1) { + stride_contiguous_d[i] = 1; + } else { + stride_contiguous_d[i] = std::max(shape[i+1], 1) * stride_contiguous_d[i + 1]; + } } AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { flip_cuda_kernel<<>>( - in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); + in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, + strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); }); return out_tensor; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 74e6481e7d63d8..89f2771b8dadf3 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -159,11 +159,18 @@ - func: argmin(Tensor self) -> Tensor - func: _argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor -# The actual implementations live in Declarations.cwrap. These are just to -# provide default values for storage_offset=self.storage_offset() - func: as_strided(Tensor self, IntList size, IntList stride) -> Tensor + - func: as_strided_(Tensor self, IntList size, IntList stride) -> Tensor +- func: as_strided(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor + python_default_init: + storage_offset: self.storage_offset() + +- func: as_strided_(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor + python_default_init: + storage_offset: self.storage_offset() + - func: asin(Tensor self) -> Tensor - func: asin_(Tensor self) -> Tensor @@ -246,6 +253,45 @@ - func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList +- func: clamp(Tensor self, Scalar min, Scalar max) -> Tensor + +- func: clamp_(Tensor self, Scalar min, Scalar max) -> Tensor + dispatch: + CPU: _clamp__cpu + CUDA: _clamp__cuda + +- func: clamp_out(Tensor result, Tensor self, Scalar min, Scalar max) -> Tensor + variants: function + dispatch: + CPU: _clamp_out_cpu + CUDA: _clamp_out_cuda + +- func: clamp_max(Tensor self, Scalar max) -> Tensor + +- func: clamp_max_(Tensor self, Scalar max) -> Tensor + dispatch: + CPU: _clamp_max__cpu + CUDA: _clamp_max__cuda + +- func: clamp_max_out(Tensor result, Tensor self, Scalar max) -> Tensor + variants: function + dispatch: + CPU: _clamp_max_out_cpu + CUDA: _clamp_max_out_cuda + +- func: clamp_min(Tensor self, Scalar min) -> Tensor + +- func: clamp_min_(Tensor self, Scalar min) -> Tensor + dispatch: + CPU: _clamp_min__cpu + CUDA: _clamp_min__cuda + +- func: clamp_min_out(Tensor result, Tensor self, Scalar min) -> Tensor + variants: function + dispatch: + CPU: _clamp_min_out_cpu + CUDA: _clamp_min_out_cuda + - func: cudnn_is_acceptable(Tensor self) -> bool variants: function device_guard: false @@ -718,6 +764,11 @@ - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor +- func: inverse(Tensor self) -> Tensor + +- func: inverse_out(Tensor result, Tensor self) -> Tensor + variants: function + - func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor - func: is_cuda(Tensor self) -> bool @@ -1096,6 +1147,9 @@ - func: 
reshape(Tensor self, IntList shape) -> Tensor +- func: reshape_as(Tensor self, Tensor other) -> Tensor + variants: method + - func: RoiPooling2d_forward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) -> (Tensor, Tensor) variants: function dispatch: @@ -1250,9 +1304,14 @@ - func: stack_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor variants: function -- func: stft(Tensor self, int64_t frame_length, int64_t hop, int64_t fft_size, bool normalized=false, bool onesided=true, Tensor? window={}, int64_t pad_end=0) -> Tensor +# The signature is designed to be consistent with librosa except that it is +# missing the `pad_mode` and `center` arguments, which are taken care of at +# `torch.functional.py`. They shall be moved here once we have mapping between +# Python strings and C++ Enum in codegen. +- func: stft(Tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, Tensor? window={}, bool normalized=false, bool onesided=true) -> Tensor python_default_init: - fft_size: frame_length + hop_length: n_fft >> 2 + win_length: n_fft - func: stride(Tensor self, int64_t dim) -> int64_t device_guard: false diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index 92ffeb32352f5a..9f977d50ead2b5 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -1,7 +1,9 @@ // included as 'TensorDenseOrSparse' in TensorDerived.cpp IntList ${Tensor}::strides() const { - return IntList(tensor->stride,dim()); + // NB: THTensor doesn't agree with Tensor for scalars, so we + // have to construct a fresh IntList + return IntList(THTensor_getStridePtr(tensor), dim()); } Scalar ${Tensor}::localScalar() { int64_t numel = ${THTensor}_nElement(${state,}tensor); diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index e15eb5fcb07dda..d89e84ee5e702e 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -31,7 +31,9 @@ const char * ${Tensor}::toString() const { } IntList ${Tensor}::sizes() const { - return IntList(tensor->size,dim()); + // NB: dim in tensor is not synchronized with THTensor, so it's + // important to apply dim here + return IntList(THTensor_getSizePtr(tensor), dim()); } int64_t ${Tensor}::dim() const { diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 894602639b462e..1f877e3e8b5987 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -5,16 +5,18 @@ #include "cuda_runtime.h" -#include #include +#include /* Tests related to ATen streams. 
*/ -TEST_CASE("Copying and Moving Streams", "Verifies streams are live through copying and moving") { +TEST_CASE( + "Copying and Moving Streams", + "Verifies streams are live through copying and moving") { int32_t device = -1; cudaStream_t cuda_stream; - + // Tests that copying works as expected and preserves the stream at::CUDAStream copyStream; { @@ -23,7 +25,7 @@ TEST_CASE("Copying and Moving Streams", "Verifies streams are live through copyi cuda_stream = s.stream(); copyStream = s; - + REQUIRE(copyStream.internals() == s.internals()); REQUIRE(copyStream.device() == device); REQUIRE(copyStream.stream() == cuda_stream); @@ -57,7 +59,7 @@ TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { // Sets and gets at::globalContext().setCurrentCUDAStream(myStream); at::CUDAStream curStream = at::globalContext().getCurrentCUDAStream(); - + REQUIRE(myStream == curStream); // Gets, sets, and gets default stream @@ -71,8 +73,7 @@ TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { TEST_CASE("Stream API retain/free", "Ensures streams are destroyed properly") { auto ptr = at::detail::CUDAStream_createAndRetainWithOptions( - at::CUDAStream::DEFAULT_FLAGS - , at::CUDAStream::DEFAULT_PRIORITY); + at::CUDAStream::DEFAULT_FLAGS, at::CUDAStream::DEFAULT_PRIORITY); at::detail::CUDAStream_free(ptr); REQUIRE(ptr == nullptr); @@ -85,7 +86,9 @@ void thread_fun(at::CUDAStream& cur_thread_stream) { REQUIRE(cur_thread_stream == new_stream); } -TEST_CASE("Multithread Getting and Setting", "Ensures streams are thread local") { +TEST_CASE( + "Multithread Getting and Setting", + "Ensures streams are thread local") { at::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; @@ -101,3 +104,98 @@ TEST_CASE("Multithread Getting and Setting", "Ensures streams are thread local") REQUIRE(cur_stream != s1); REQUIRE(s0 != s1); } + +TEST_CASE("CUDAGuard") { + if (at::globalContext().getNumGPUs() < 2) { + return; + } + + // -- begin setup + + REQUIRE(at::current_device() == 0); + std::vector streams0 = { + at::globalContext().getDefaultCUDAStream(), + at::globalContext().createCUDAStream()}; + REQUIRE(streams0[0].device() == 0); + REQUIRE(streams0[1].device() == 0); + at::globalContext().setCurrentCUDAStreamOnDevice(0, streams0[0]); + + std::vector streams1; + { + at::DeviceGuard device_guard(1); + streams1.push_back(at::globalContext().getDefaultCUDAStream()); + streams1.push_back(at::globalContext().createCUDAStream()); + } + REQUIRE(streams1[0].device() == 1); + REQUIRE(streams1[1].device() == 1); + at::globalContext().setCurrentCUDAStreamOnDevice(1, streams1[0]); + + REQUIRE(at::current_device() == 0); + + // -- end setup + + // Test that all original streams are recorded. 
+ { + at::CUDAGuard guard; + REQUIRE(guard.original_streams().empty()); + guard.set_stream(streams0[0]); + REQUIRE( + guard.original_streams().size() == at::globalContext().getNumGPUs()); + REQUIRE(guard.original_streams()[0] == streams0[0]); + REQUIRE(guard.original_streams()[1] == streams1[0]); + } + + // Setting a stream changes the current device and the stream on that device + { + at::CUDAGuard guard(streams1[1]); + REQUIRE(guard.last_device() == 1); + REQUIRE(at::current_device() == 1); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[1]); + } + + // Device and stream are now reset + REQUIRE(at::current_device() == 0); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + + // Setting only the device changes only the current device and not the stream + { + at::CUDAGuard guard(/*device=*/1); + REQUIRE(guard.last_device() == 1); + REQUIRE(at::current_device() == 1); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + } + + REQUIRE(at::current_device() == 0); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); + + // Setting the stream first, and then the device, first changes the devices + // back, and then resets the stream on the initial device. + + { + at::CUDAGuard guard(streams0[1]); + guard.set_device(1); + } + + REQUIRE(at::current_device() == 0); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); +} + +TEST_CASE("CUDAGuardIsMovable") { + if (at::globalContext().getNumGPUs() < 2) { + return; + } + const auto stream = at::globalContext().createCUDAStream(); + const auto device_count = at::globalContext().getNumGPUs(); + at::CUDAGuard first(stream); + first.set_device(1); + at::CUDAGuard second(std::move(first)); + REQUIRE(second.original_streams().size() == device_count); + REQUIRE(second.original_device() == 0); + REQUIRE(second.last_device() == 1); + at::CUDAGuard third; + third = std::move(second); + REQUIRE(third.original_streams().size() == device_count); + REQUIRE(third.original_device() == 0); + REQUIRE(third.last_device() == 1); +} diff --git a/aten/src/README.md b/aten/src/README.md index a641ea1b5ffb8f..530d8dd6b48c6e 100644 --- a/aten/src/README.md +++ b/aten/src/README.md @@ -75,7 +75,7 @@ under some conditions you have to have to call, e.g., `newContiguous`, to get it into the correct form: ``` - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + if (!(k_->stride(3) == 1) || !(k_->stride[2] == k_->size(3))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 5d588df14ada2e..86fd8db5ff55c6 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -21,7 +21,7 @@ IF(C_AVX2_FOUND) ENDIF(C_AVX2_FOUND) SET(hdr - THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h + THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THStorageFunctions.h THTensor.h THTensorApply.h THBlas.h THMath.h THLapack.h THLogAdd.h THRandom.h THVector.h ) set(ATen_TH_SRCS @@ -29,7 +29,8 @@ set(ATen_TH_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THStorageClass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THTensor.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/THBlas.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THLapack.cpp @@ -86,13 +87,15 @@ INSTALL(FILES THRandom.h THSize.h THStorage.h + THStorageFunctions.h THTensor.h THTensorApply.h THTensorDimApply.h THVector.h THHalf.h THTensor.hpp - THStorage.hpp + THStorageClass.hpp + THStorageFunctions.hpp THGenerator.hpp THTypeConversion.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") diff --git a/aten/src/TH/TH.h b/aten/src/TH/TH.h index 08bdde867ce45e..1faf6e52b5a468 100644 --- a/aten/src/TH/TH.h +++ b/aten/src/TH/TH.h @@ -12,7 +12,7 @@ #include "THLogAdd.h" #include "THRandom.h" #include "THSize.h" -#include "THStorage.h" +#include "THStorageFunctions.h" #include "THTensor.h" #include "THTensorApply.h" #include "THTensorDimApply.h" diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h index c054c0c07c052c..460f23873fff08 100644 --- a/aten/src/TH/THAllocator.h +++ b/aten/src/TH/THAllocator.h @@ -25,7 +25,7 @@ typedef struct at_THAllocator THAllocator; /* default malloc/free allocator. malloc and realloc raise an error (using * THError) on allocation failure. */ -TH_API THAllocator* getTHDefaultAllocator(); +TH_API THAllocator* getTHDefaultAllocator(void); #ifdef __cplusplus // Sentinel value/type to help distinguish the file descriptor constructor from diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp index f3e17419dfaca9..ae0fdf10455b6e 100644 --- a/aten/src/TH/THFile.cpp +++ b/aten/src/TH/THFile.cpp @@ -1,5 +1,5 @@ #include "THFile.h" -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include "THFilePrivate.h" #define IMPLEMENT_THFILE_RW(TYPEC, TYPE) \ diff --git a/aten/src/TH/THFile.h b/aten/src/TH/THFile.h index 27041f51c70982..8844b0eca66d26 100644 --- a/aten/src/TH/THFile.h +++ b/aten/src/TH/THFile.h @@ -1,7 +1,7 @@ #ifndef TH_FILE_INC #define TH_FILE_INC -#include "THStorage.h" +#include "THStorageFunctions.h" typedef struct THFile__ THFile; diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp index 667d7fbf253d47..1b89e17dce997c 100644 --- a/aten/src/TH/THGeneral.cpp +++ b/aten/src/TH/THGeneral.cpp @@ -303,7 +303,7 @@ TH_API void THInferNumThreads(void) #endif } -TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { +THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { const int L = TH_DESC_BUFF_LEN; THDescBuff buf; char *str = buf.str; diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index e13b02f8c29c0b..46582c913270cb 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -1,5 +1,5 @@ #include "THMemoryFile.h" -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include "THFilePrivate.h" #include "THDiskFile.h" #include "stdint.h" diff --git a/aten/src/TH/THMemoryFile.h b/aten/src/TH/THMemoryFile.h index b54cdcc2f2cfa0..c8cab3667b8ffc 100644 --- a/aten/src/TH/THMemoryFile.h +++ b/aten/src/TH/THMemoryFile.h @@ -2,7 +2,7 @@ #define TH_MEMORY_FILE_INC #include "THFile.h" -#include "THStorage.h" +#include "THStorageFunctions.h" TH_API THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode); TH_API THFile *THMemoryFile_new(const char *mode); diff --git a/aten/src/TH/THStorage.h b/aten/src/TH/THStorage.h index ce53827b9f6fce..3d25c2129682ff 100644 --- a/aten/src/TH/THStorage.h +++ b/aten/src/TH/THStorage.h @@ -1,25 +1,4 @@ #pragma once +#include "THStorageFunctions.h" -#include "THGeneral.h" -#include "THAllocator.h" - -#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) - -#include "generic/THStorage.h" 
-#include "THGenerateAllTypes.h" - -#include "generic/THStorage.h" -#include "THGenerateHalfType.h" - -#include "generic/THStorageCopy.h" -#include "THGenerateAllTypes.h" - -#include "generic/THStorageCopy.h" -#include "THGenerateHalfType.h" - -// This exists to have a data-type independent way of freeing (necessary for THPPointer). -TH_API void THStorage_free(THStorage *storage); -TH_API void THStorage_weakFree(THStorage *storage); - -TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); -TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); +// Compatability header. Use THStorageFunctions.h instead if you need this. diff --git a/aten/src/TH/THStorageClass.cpp b/aten/src/TH/THStorageClass.cpp new file mode 100644 index 00000000000000..49e9a6b6c8c20c --- /dev/null +++ b/aten/src/TH/THStorageClass.cpp @@ -0,0 +1,28 @@ +#include "THStorageClass.hpp" + +THStorage::THStorage( + at::ScalarType scalar_type, + ptrdiff_t size, + at::DataPtr data_ptr, + at::Allocator* allocator, + char flag) + : scalar_type(scalar_type), + data_ptr(std::move(data_ptr)), + size(size), + refcount(1), + weakcount(1), // from the strong reference + flag(flag), + allocator(allocator), + finalizer(nullptr) {} + +THStorage::THStorage( + at::ScalarType scalar_type, + ptrdiff_t size, + at::Allocator* allocator, + char flag) + : THStorage( + scalar_type, + size, + allocator->allocate(at::elementSize(scalar_type) * size), + allocator, + flag) {} diff --git a/aten/src/TH/THStorageClass.hpp b/aten/src/TH/THStorageClass.hpp new file mode 100644 index 00000000000000..99031f635b7035 --- /dev/null +++ b/aten/src/TH/THStorageClass.hpp @@ -0,0 +1,76 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include + +#include +#include +#include "THTypeConversion.hpp" +#include + +// Note [Weak references for intrusive refcounting] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Here's the scheme: +// +// - refcount == number of strong references to the object +// weakcount == number of weak references to the object, +// plus one more if refcount > 0 +// +// - THStorage stays live as long as there are any strong +// or weak pointers to it (weakcount > 0, since strong +// references count as a +1 to weakcount) +// +// - finalizers are called and data_ptr is deallocated when refcount == 0 +// +// - Once refcount == 0, it can never again be > 0 (the transition +// from > 0 to == 0 is monotonic) +// +// - When you access THStorage via a weak pointer, you must +// atomically increment the use count, if it is greater than 0. +// If it is not, you must report that the storage is dead. 
+// + +struct THFinalizer { + virtual void operator()() = 0; + virtual ~THFinalizer() {}; +}; + +struct THStorage +{ + THStorage() = delete; + THStorage(at::ScalarType, ptrdiff_t, at::DataPtr, at::Allocator*, char); + THStorage(at::ScalarType, ptrdiff_t, at::Allocator*, char); + at::ScalarType scalar_type; + at::DataPtr data_ptr; + ptrdiff_t size; + std::atomic refcount; + std::atomic weakcount; + char flag; + at::Allocator* allocator; + std::unique_ptr finalizer; + struct THStorage* view; + THStorage(THStorage&) = delete; + THStorage(const THStorage&) = delete; + THStorage(THStorage&&) = delete; + THStorage(const THStorage&&) = delete; + + template + inline T* data() const { + auto scalar_type_T = at::CTypeToScalarType>::to(); + if (scalar_type != scalar_type_T) { + AT_ERROR( + "Attempt to access Storage having data type ", + at::toString(scalar_type), + " as data type ", + at::toString(scalar_type_T)); + } + return unsafe_data(); + } + + template + inline T* unsafe_data() const { + return static_cast(this->data_ptr.get()); + } +}; diff --git a/aten/src/TH/THStorage.cpp b/aten/src/TH/THStorageFunctions.cpp similarity index 59% rename from aten/src/TH/THStorage.cpp rename to aten/src/TH/THStorageFunctions.cpp index f4910c3f07fe32..c3db776b632e75 100644 --- a/aten/src/TH/THStorage.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -1,6 +1,6 @@ #include -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include "generic/THStorage.cpp" #include "THGenerateAllTypes.h" @@ -25,8 +25,8 @@ void THStorage_free(THStorage *storage) { if (storage->finalizer) { (*storage->finalizer)(); } - storage->finalizer.~unique_ptr(); - storage->data_ptr.~DataPtr(); + storage->finalizer = nullptr; + storage->data_ptr.clear(); THStorage_weakFree(storage); } } @@ -40,9 +40,7 @@ void THStorage_weakRetain(THStorage *weak_storage) { // Releases a weak reference void THStorage_weakFree(THStorage *weak_storage) { if (--weak_storage->weakcount == 0) { - weak_storage->refcount.~atomic(); - weak_storage->weakcount.~atomic(); - THFree(weak_storage); + delete weak_storage; } } @@ -91,62 +89,11 @@ THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElemen return copy; } -THStorage* THStorage_new(at::ScalarType scalar_type) -{ - return THStorage_newWithSize(scalar_type, 0); -} - -THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size) -{ - return THStorage_newWithAllocator(scalar_type, size, getTHDefaultAllocator()); -} - -THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, - at::Allocator *allocator) -{ - THStorage *storage = static_cast(THAlloc(sizeof(THStorage))); - storage->scalar_type = scalar_type; - new (&storage->data_ptr) at::DataPtr(allocator->allocate(at::elementSize(scalar_type)*size)); - storage->size = size; - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); // from the strong reference - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; - return storage; -} - ptrdiff_t THStorage_size(const THStorage *self) { return self->size; } -size_t THStorage_elementSize(const THStorage *self) -{ - return at::elementSize(self->scalar_type); -} - -THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags) -{ - size_t actual_size = -1; - THStorage *storage = THStorage_newWithDataAndAllocator(scalar_type, - THMapAllocator::makeDataPtr( - filename, - flags, - size 
* at::elementSize(scalar_type), - &actual_size), - size, - /* allocator */ nullptr); - - if (size <= 0) { - storage->size = actual_size/THStorage_elementSize(storage); - } - - THStorage_clearFlag(storage, TH_STORAGE_RESIZABLE); - - return storage; -} - void THStorage_setFlag(THStorage *storage, const char flag) { storage->flag |= flag; @@ -173,21 +120,6 @@ THStorage* THStorage_newWithData(at::ScalarType scalar_type, std::unique_ptr(THAlloc(sizeof(THStorage))); - storage->scalar_type = scalar_type; - new (&storage->data_ptr) at::DataPtr(std::move(data)); - storage->size = size; - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); // from the strong reference - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; - return storage; -} - void THStorage_resize(THStorage *storage, ptrdiff_t size) { if (storage->flag & TH_STORAGE_RESIZABLE) diff --git a/aten/src/TH/THStorageFunctions.h b/aten/src/TH/THStorageFunctions.h new file mode 100644 index 00000000000000..ce53827b9f6fce --- /dev/null +++ b/aten/src/TH/THStorageFunctions.h @@ -0,0 +1,25 @@ +#pragma once + +#include "THGeneral.h" +#include "THAllocator.h" + +#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) + +#include "generic/THStorage.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorage.h" +#include "THGenerateHalfType.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateHalfType.h" + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +TH_API void THStorage_free(THStorage *storage); +TH_API void THStorage_weakFree(THStorage *storage); + +TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); +TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); diff --git a/aten/src/TH/THStorage.hpp b/aten/src/TH/THStorageFunctions.hpp similarity index 50% rename from aten/src/TH/THStorage.hpp rename to aten/src/TH/THStorageFunctions.hpp index e02e265062d94b..9ef48dcfbd870e 100644 --- a/aten/src/TH/THStorage.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -3,7 +3,8 @@ // STOP!!! Thinking of including this header directly? Please // read Note [TH abstraction violation] -#include "THStorage.h" +#include "THStorageClass.hpp" +#include "THStorageFunctions.h" #include #include @@ -32,52 +33,11 @@ // If it is not, you must report that the storage is dead. 
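The same note, carried over into the renamed THStorageFunctions.hpp, ends with the rule for reading through a weak pointer: atomically bump the strong count only if it is still greater than zero, otherwise report the storage as dead. The patch documents the rule without adding a helper for it, so the following is only a sketch of what such a promotion could look like; try_promote_weak is a hypothetical name.

```cpp
#include <atomic>

// Returns true and takes a strong reference if the object is still alive;
// returns false if refcount has already reached zero (the storage is dead).
bool try_promote_weak(std::atomic<int>& refcount) {
  int cur = refcount.load();
  while (cur > 0) {
    // Bump the strong count only if it has not dropped to zero in the
    // meantime; on failure, cur is refreshed and the check repeats.
    if (refcount.compare_exchange_weak(cur, cur + 1)) {
      return true;
    }
  }
  return false;
}
```

A caller that gets true back holds a strong reference and must eventually release it through the strong-release path sketched earlier.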
// -struct THFinalizer { - virtual void operator()() = 0; - virtual ~THFinalizer() {}; -}; - -typedef struct THStorage -{ - at::ScalarType scalar_type; - at::DataPtr data_ptr; - ptrdiff_t size; - std::atomic refcount; - std::atomic weakcount; - char flag; - at::Allocator *allocator; - std::unique_ptr finalizer; - - template - inline T * data() const { - auto scalar_type_T = at::CTypeToScalarType>::to(); - if (scalar_type != scalar_type_T) { - AT_ERROR("Attempt to access Storage having data type ", at::toString(scalar_type), - " as data type ", at::toString(scalar_type_T)); - } - return unsafe_data(); - } - - template - inline T * unsafe_data() const { - return static_cast(this->data_ptr.get()); - } -} THStorage; - -TH_API THStorage* THStorage_new(at::ScalarType scalar_type); -TH_API THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size); -TH_API THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, - at::Allocator *allocator); - ptrdiff_t THStorage_size(const THStorage *self); -size_t THStorage_elementSize(); -THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags); + void THStorage_setFlag(THStorage *storage, const char flag); void THStorage_clearFlag(THStorage *storage, const char flag); void THStorage_retain(THStorage *storage); -THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type, - at::DataPtr&& data, ptrdiff_t size, - at::Allocator* allocator); void THStorage_resize(THStorage *storage, ptrdiff_t size); void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index b2815ca4868dcd..48ddcd2c57ba10 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -43,17 +43,9 @@ void THTensor_free(THTensor *self) if(!self) return; - if(self->flag & TH_TENSOR_REFCOUNTED) + if(--self->refcount == 0) { - if(--self->refcount == 0) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THStorage_free(self->storage); - self->refcount.~atomic(); - THFree(self); - } + delete self; } } diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h index 3984bf9172ff0c..3335a6f5d8cc50 100644 --- a/aten/src/TH/THTensor.h +++ b/aten/src/TH/THTensor.h @@ -1,7 +1,7 @@ #ifndef TH_TENSOR_INC #define TH_TENSOR_INC -#include "THStorage.h" +#include "THStorageFunctions.h" #include "THTensorApply.h" #define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 71236afa4b5626..bc9f23ee5e3cee 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -4,24 +4,38 @@ // read Note [TH abstraction violation] #include "THTensor.h" -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include #include -typedef struct THTensor +struct THTensor { - int64_t *size; - int64_t *stride; - int64_t dim_; + THTensor(THStorage* storage) + : refcount(1) + , storage(storage) + , storageOffset(0) + , sizes_{0} + , strides_{1} + , dim_(1) + {} + + ~THTensor() { + if (storage) { + THStorage_free(storage); + } + } + + std::atomic refcount; // Note: storage->size may be greater than the recorded size // of a tensor THStorage *storage; ptrdiff_t storageOffset; - std::atomic refcount; - char flag; + std::vector sizes_; + std::vector strides_; + int64_t dim_; template inline T * data() const { @@ -47,21 +61,65 @@ typedef struct THTensor // represents that numel() == 0. 
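The THTensor rewrite in this hunk replaces the raw int64_t size / stride arrays with std::vector<int64_t> sizes_ / strides_ plus size(d) / stride(d) accessors, which is why the new helpers further down (THTensor_resizeDim, THTensor_setSizesAndStrides, THTensor_setSizeAtDim, THTensor_setStrideAtDim) stay small and why resizeDim can preserve existing entries. A toy model of the new layout, with illustrative names only:

```cpp
#include <cstdint>
#include <vector>

struct ToyTensor {
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;
  int64_t dim_ = 0;
};

// Mirrors THTensor_resizeDim in the patch: a true resize that keeps the
// existing sizes/strides, which calling code such as squeeze depends on.
void resize_dim(ToyTensor& t, int64_t ndim) {
  t.dim_ = ndim;
  t.sizes_.resize(ndim);
  t.strides_.resize(ndim);
}

// A tensor is empty iff some size is 0, matching is_empty() in the hunk
// that follows.
bool is_empty(const ToyTensor& t) {
  for (int64_t s : t.sizes_) {
    if (s == 0) return true;
  }
  return false;
}
```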
inline bool is_empty() const { for (int64_t i = 0; i < dim_; ++i) { - if (size[i] == 0) { - return true; + if (sizes_[i] == 0) { + return true; } } return false; } + int64_t size(int64_t d) const { + d = at::maybe_wrap_dim(d, dim(), false); + return sizes_[d]; + } + + int64_t stride(int64_t d) const { + d = at::maybe_wrap_dim(d, dim(), false); + return strides_[d]; + } + inline at::IntList sizes() { - return at::IntList(size, dim_); + return sizes_; } -} THTensor; + + inline at::IntList strides() { + return strides_; + } +}; #include "generic/THTensorFastGetSet.hpp" #include "THGenerateAllTypes.h" +inline int64_t* THTensor_getSizePtr(THTensor* tensor) { + return tensor->sizes_.data(); +} + +inline int64_t* THTensor_getStridePtr(THTensor* tensor) { + return tensor->strides_.data(); +} + +inline void THTensor_resizeDim(THTensor* tensor, int64_t ndim) { + tensor->dim_ = ndim; + // NB: This is *truly* a resize; calling code (e.g., squeeze) + // assumes that old values are preserved + tensor->sizes_.resize(ndim); + tensor->strides_.resize(ndim); +} + +inline void THTensor_setSizesAndStrides(THTensor* tensor, std::vector&& new_size, std::vector&& new_stride) { + tensor->dim_ = new_size.size(); + tensor->sizes_ = std::move(new_size); + tensor->strides_ = std::move(new_stride); +} + +inline void THTensor_setSizeAtDim(THTensor* tensor, int dim, int64_t new_size) { + tensor->sizes_[dim] = new_size; +} + +inline void THTensor_setStrideAtDim(THTensor* tensor, int dim, int64_t new_stride) { + tensor->strides_[dim] = new_stride; +} + TH_API void THTensor_free(THTensor *self); at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h index 0b699e89d832c6..514a4969df83e2 100644 --- a/aten/src/TH/THTensorApply.h +++ b/aten/src/TH/THTensorApply.h @@ -37,7 +37,7 @@ int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \ TENSOR##_n = 1; \ for(TENSOR##_i = 0; TENSOR##_i < TENSOR->dim(); TENSOR##_i++) \ - TENSOR##_n *= TENSOR->size[TENSOR##_i]; \ + TENSOR##_n *= TENSOR->size(TENSOR##_i); \ \ if(TENSOR->is_empty()) \ TH_TENSOR_APPLY_hasFinished = 1; \ @@ -47,9 +47,9 @@ TENSOR##_size = 1; \ TENSOR##_stride = 1; \ for(TENSOR##_i = TENSOR->_dim()-1; TENSOR##_i >= 0; TENSOR##_i--) { \ - if(TENSOR->size[TENSOR##_i] != 1) { \ - if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \ - TENSOR##_size *= TENSOR->size[TENSOR##_i]; \ + if(TENSOR->size(TENSOR##_i) != 1) { \ + if(TENSOR->stride(TENSOR##_i) == TENSOR##_size && TENSOR##_i != DIM) \ + TENSOR##_size *= TENSOR->size(TENSOR##_i); \ else{ \ TENSOR##_contiguous = 0; \ break; \ @@ -61,7 +61,7 @@ TENSOR##_dim = 1; \ for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; TENSOR##_i--) \ { \ - if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ + if(TENSOR->stride(TENSOR##_i) != TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ TENSOR##_dim++; \ } \ /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ @@ -70,8 +70,8 @@ TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ TH_TENSOR_dim_index = TENSOR##_dim-1; \ TENSOR##_dimOffset = (DIM == TENSOR->_dim()-1) ? 
&TENSOR##_i : &TENSOR##_counter[DIM]; \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->_dim()-1]; \ - TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->_dim()-1]; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR->_dim()-1); \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR->_dim()-1); \ /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ /* storage is given by storage_offset + (i * j), where i is the stride */ \ /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \ @@ -79,14 +79,14 @@ TENSOR##_counter[TENSOR##_i] = 0; \ } \ for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; --TENSOR##_i) { \ - if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \ + if (TENSOR->stride(TENSOR##_i) == TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i) * TENSOR##_sizes[TH_TENSOR_dim_index]; \ if (DIM != TENSOR->_dim()-1 && TENSOR##_i < DIM) \ TENSOR##_dimOffset--; \ } else { \ --TH_TENSOR_dim_index; \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \ - TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i); \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR##_i); \ } \ } \ /* Size of the inner most section */ \ @@ -160,13 +160,12 @@ elements_equal = 0; \ } \ if (elements_equal == 0) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ - THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ - "number of elements, but got %d, %d and %d elements respectively", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, \ - TENSOR1##_n, TENSOR2##_n, TENSOR3##_n); \ + AT_ERROR("inconsistent tensor size, expected ", \ + #TENSOR1, " ", TENSOR1->sizes(), ", ", \ + #TENSOR2, " ", TENSOR2->sizes(), " and ", \ + #TENSOR3, " ", TENSOR3->sizes(), " to have the same " \ + "number of elements, but got ", TENSOR1##_n, ", ", \ + TENSOR2##_n, " and ", TENSOR3##_n, " elements respectively"); \ } \ \ while(!TH_TENSOR_APPLY_hasFinished) \ @@ -199,11 +198,11 @@ __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ \ if(TENSOR1##_n != TENSOR2##_n) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ - "number of elements, but got %d and %d elements respectively", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, TENSOR1##_n, TENSOR2##_n); \ + AT_ERROR("inconsistent tensor size, expected ", \ + #TENSOR1, " ", TENSOR1->sizes(), " and ", \ + #TENSOR2, " ", TENSOR2->sizes(), \ + " to have the same number of elements, but got ", \ + TENSOR1##_n, " and ", TENSOR2##_n, " elements respectively"); \ } \ while(!TH_TENSOR_APPLY_hasFinished) \ { \ diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index 828b92dcb3ae69..e85bd0e9137e87 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ 
-9,25 +9,21 @@ #define TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ { \ int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if (TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ - if (TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + if (TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) { \ + if(TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR3->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ } \ if (shape_check_flag == 1) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ - THError("Expected %s %s, %s %s and %s %s to have the same size apart from dimension %d", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \ + AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ", TENSOR3->sizes(), " to have the same size apart from dimension ", DIMENSION); \ } \ } @@ -40,55 +36,54 @@ TYPE3 *TENSOR3##_data = NULL; \ TH_UNUSED int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \ int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->_dim()) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->_dim()); \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ int same_dims = 1; \ - if( TENSOR1->_dim() != TENSOR2->_dim() ) { \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ same_dims = 0; \ } \ - if( TENSOR1->_dim() != TENSOR3->_dim() ) { \ + if( TENSOR1->dim() != TENSOR3->dim() ) { \ same_dims = 0; \ } \ if (same_dims == 0) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ - THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ - "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str); \ + AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ",TENSOR3->sizes() , " to have the same number of dimensions"); \ } \ SIZE_CHECK(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->_dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = 
(TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ - TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ - TENSOR1##_size = TENSOR1->size[DIMENSION]; \ + TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ + TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ - TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ - TENSOR2##_size = TENSOR2->size[DIMENSION]; \ + TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ + TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ TENSOR3##_data = (TENSOR3)->storage->data()+(TENSOR3)->storageOffset; \ - TENSOR3##_stride = (TENSOR3)->stride[DIMENSION]; \ - TENSOR3##_size = TENSOR3->size[DIMENSION]; \ + TENSOR3##_stride = (TENSOR3)->stride(DIMENSION); \ + TENSOR3##_size = TENSOR3->size(DIMENSION); \ \ while(!TH_TENSOR_DIM_APPLY_hasFinished) \ { \ CODE \ \ - if(TENSOR1->_dim() == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -97,22 +92,22 @@ } \ \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR3##_data += TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data += TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data += TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR3##_data += TENSOR3->stride(TH_TENSOR_DIM_APPLY_i); \ \ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ } \ else \ { \ - TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride(TH_TENSOR_DIM_APPLY_i); \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ } \ } \ @@ -147,54 +142,51 @@ TYPE2 *TENSOR2##_data = NULL; \ TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->_dim()) ) \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->_dim()); \ - if( TENSOR1->_dim() != TENSOR2->_dim() ) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - 
THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ - "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ + AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \ } \ TH_UNUSED int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THError("Expected %s %s and %s %s to have the same size in dimension %d", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, DIMENSION); \ + if(TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \ + AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size in dimension ", DIMENSION); \ } \ } \ \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->_dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ - TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ - TENSOR1##_size = TENSOR1->size[DIMENSION]; \ + TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ + TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ - TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ - TENSOR2##_size = TENSOR2->size[DIMENSION]; \ + TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ + TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ while(!TH_TENSOR_DIM_APPLY_hasFinished) \ { \ CODE \ \ - if(TENSOR1->_dim() == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -203,20 +195,20 @@ } \ \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data += TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data += TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ \ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == 
TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ } \ else \ { \ - TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ } \ } \ @@ -278,8 +270,8 @@ THError("invalid dimension"); \ \ TENSOR##_data = (TENSOR)->storage->data()+(TENSOR)->storageOffset; \ - TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \ - TENSOR##_size = TENSOR->size[DIMENSION]; \ + TENSOR##_stride = (TENSOR)->stride(DIMENSION); \ + TENSOR##_size = TENSOR->size(DIMENSION); \ /* Counter stores the indices into the Tensor at any time */ \ TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR->_dim())); \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->_dim(); TH_TENSOR_DIM_APPLY_i++) \ @@ -310,9 +302,9 @@ \ /* Bump the counter at this index, update the pointer */ \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR##_data += TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR##_data += TENSOR->stride(TH_TENSOR_DIM_APPLY_i); \ \ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size[TH_TENSOR_DIM_APPLY_i]) \ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size(TH_TENSOR_DIM_APPLY_i)) \ { \ /* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. If this is the last dimension, exit */ \ if(TH_TENSOR_DIM_APPLY_i == TENSOR->_dim()-1) \ @@ -323,7 +315,7 @@ else \ { \ /* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \ - TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride(TH_TENSOR_DIM_APPLY_i); \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ } \ } \ diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 2d499b0b67578c..de4b7035fcd4d0 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,24 +21,55 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType>::to()); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + 0, + getTHDefaultAllocator(), + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { - return THStorage_newWithSize(at::CTypeToScalarType>::to(), size); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + size, + getTHDefaultAllocator(), + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { - return THStorage_newWithAllocator(at::CTypeToScalarType>::to(), size, allocator); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + size, + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - return THStorage_newWithMapping(at::CTypeToScalarType>::to(), filename, size, flags); + auto scalar_type = 
at::CTypeToScalarType>::to(); + size_t actual_size = -1; + THStorage* storage = new THStorage( + scalar_type, + size, + THMapAllocator::makeDataPtr( + filename, flags, size * at::elementSize(scalar_type), &actual_size), + /* allocator */ nullptr, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + + if (size <= 0) { + storage->size = actual_size / at::elementSize(scalar_type); + } + + THStorage_clearFlag(storage, TH_STORAGE_RESIZABLE); + + return storage; } THStorage* THStorage_(newWithSize1)(real data0) @@ -101,7 +132,13 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { - return THStorage_newWithDataAndAllocator(at::CTypeToScalarType>::to(), std::move(data), size, allocator); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + size, + std::move(data), + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } void THStorage_(resize)(THStorage *storage, ptrdiff_t size) diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index d03c7294f58c94..92314de69bae29 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -29,27 +29,27 @@ int64_t THTensor_(size)(const THTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); - return self->size[dim]; + return self->size(dim); } int64_t THTensor_(stride)(const THTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); - return self->stride[dim]; + return self->stride(dim); } THLongStorage *THTensor_(newSizeOf)(THTensor *self) { THLongStorage *size = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(size, self->size); + THLongStorage_rawCopy(size, THTensor_getSizePtr(self)); return size; } THLongStorage *THTensor_(newStrideOf)(THTensor *self) { THLongStorage *stride = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(stride, self->stride); + THLongStorage_rawCopy(stride, THTensor_getStridePtr(self)); return stride; } @@ -61,53 +61,36 @@ real *THTensor_(data)(const THTensor *self) return NULL; } -void THTensor_(setFlag)(THTensor *self, const char flag) -{ - self->flag |= flag; -} - -void THTensor_(clearFlag)(THTensor *self, const char flag) -{ - self->flag &= ~flag; -} - /**** creation methods ****/ -static void THTensor_(rawInit)(THTensor *self); - - /* Empty init */ THTensor *THTensor_(new)(void) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); - return self; + return new THTensor(THStorage_(new)()); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); + THTensor *self = new THTensor(THStorage_(new)()); THTensor_(setStorageNd)(self, tensor->storage, tensor->storageOffset, tensor->dim(), - tensor->size, - tensor->stride); + THTensor_getSizePtr(tensor), + THTensor_getStridePtr(tensor)); return self; } /* Storage init */ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); if(size && stride) { THArgCheck(size->size == stride->size, 4, "inconsistent size"); } - AT_CHECK(size, "size must not be null"); - THTensor_(rawInit)(self); 
+ + THTensor *self = new THTensor(THStorage_(new)()); #ifdef DEBUG THAssert(size->size <= INT_MAX); #endif @@ -123,8 +106,7 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THTensor *THTensor_(newWithStorageIntLists)(THStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); + THTensor *self = new THTensor(THStorage_(new)()); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -169,8 +151,7 @@ THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride) } THTensor *THTensor_(newWithSizeIntList)(at::IntList sizes) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); + THTensor *self = new THTensor(THStorage_(new)()); THTensor_(resizeNd)(self, sizes.size(), const_cast(sizes.data()), nullptr); return self; @@ -248,8 +229,8 @@ THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) ptrdiff_t numel = THTensor_(nElement)(tensor); THTensor *self = THTensor_(new)(); THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); - auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), - at::IntList(tensor->stride, tensor->dim()), + auto stride = THTensor_compute_stride(tensor->sizes(), + tensor->strides(), at::IntList(inferred_size->data(), inferred_size->size)); THArgCheck(stride.has_value(), 2, "view size is " "not compatible with input tensor's size and stride (at least one dimension spans " @@ -279,7 +260,7 @@ void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *strid void THTensor_(resizeAs)(THTensor *self, THTensor *src) { if(!THTensor_(isSameSizeAs)(self, src)) - THTensor_(resizeNd)(self, src->dim(), src->size, NULL); + THTensor_(resizeNd)(self, src->dim(), THTensor_getSizePtr(src), NULL); } void THTensor_(resize1d)(THTensor *tensor, int64_t size0) @@ -319,8 +300,8 @@ void THTensor_(set)(THTensor *self, THTensor *src) src->storage, src->storageOffset, src->dim(), - src->size, - src->stride); + THTensor_getSizePtr(src), + THTensor_getStridePtr(src)); } void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) @@ -401,14 +382,14 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir #else THArgCheck( size > 0, 4, "out of range"); #endif - THArgCheck(firstIndex <= src->size[dimension] - size, 4, "out of range"); + THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range"); THTensor_(set)(self, src); if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride[dimension]; + self->storageOffset += firstIndex*self->stride(dimension); - self->size[dimension] = size; + THTensor_setSizeAtDim(self, dimension, size); } void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex) @@ -418,20 +399,24 @@ void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sli if(!src) src = self; -#ifndef USE_TH_SCALAR +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#else +#ifndef USE_TH_SCALAR + THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); - THArgCheck((sliceIndex >= 0) && 
(sliceIndex < src->size[dimension]), 3, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range"); THTensor_(set)(self, src); THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1); for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2) @@ -441,26 +426,24 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->_dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->_dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THTensor_(set)(self, src); if(dimension1 == dimension2) return; - z = self->stride[dimension1]; - self->stride[dimension1] = self->stride[dimension2]; - self->stride[dimension2] = z; - z = self->size[dimension1]; - self->size[dimension1] = self->size[dimension2]; - self->size[dimension2] = z; + z = self->stride(dimension1); + THTensor_setStrideAtDim(self, dimension1, self->stride(dimension2)); + THTensor_setStrideAtDim(self, dimension2, z); + z = self->size(dimension1); + THTensor_setSizeAtDim(self, dimension1, self->size(dimension2)); + THTensor_setSizeAtDim(self, dimension2, z); } void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t size, int64_t step) { - int64_t *newSize; - int64_t *newStride; int d; if(!src) @@ -470,36 +453,31 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); - THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THTensor_(set)(self, src); - newSize = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); - newStride = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); + std::vector newSize(/* size */ self->dim()+1); + std::vector newStride(/* size */ self->dim()+1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride[dimension]; + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { if(d == dimension) { - newSize[d] = (self->size[d] - size) / step + 1; - newStride[d] = step*self->stride[d]; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self->size[d]; - newStride[d] = self->stride[d]; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } - THFree(self->size); - THFree(self->stride); - - self->size = newSize; - self->stride = newStride; - self->dim_++; + THTensor_setSizesAndStrides(self, std::move(newSize), std::move(newStride)); } /* we have to handle the case where the result is a number */ @@ -515,12 +493,12 @@ void THTensor_(squeeze)(THTensor *self, THTensor *src) for(d = 0; d < src->dim(); d++) { - if(src->size[d] != 1) + if(src->size(d) != 1) { if(d != ndim) { - self->size[ndim] = src->size[d]; - self->stride[ndim] = src->stride[d]; + THTensor_setSizeAtDim(self, ndim, 
src->size(d)); + THTensor_setStrideAtDim(self, ndim, src->stride(d)); } ndim++; } @@ -530,12 +508,12 @@ void THTensor_(squeeze)(THTensor *self, THTensor *src) /* right now, we do not handle 0-dimension tensors */ if(ndim == 0 && src->dim() > 0) { - self->size[0] = 1; - self->stride[0] = 1; + THTensor_setSizeAtDim(self, 0, 1); + THTensor_setStrideAtDim(self, 0, 1); ndim = 1; } #endif - self->dim_ = ndim; + THTensor_resizeDim(self, ndim); } void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) @@ -550,17 +528,17 @@ void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) THTensor_(set)(self, src); #ifdef USE_TH_SCALAR - if(src->size[dimension] == 1) + if(src->size(dimension) == 1) #else - if(src->size[dimension] == 1 && src->dim() > 1) + if(src->size(dimension) == 1 && src->dim() > 1) #endif { for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } } @@ -578,19 +556,17 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) THTensor_(set)(self, src); - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); - self->dim_++; + THTensor_resizeDim(self, self->dim() + 1); for (d = self->dim()-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; + THTensor_setSizeAtDim(self, d, self->size(d-1)); + THTensor_setStrideAtDim(self, d, self->stride(d-1)); } if (dimension+1 < self->dim()) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + THTensor_setStrideAtDim(self, dimension, self->size(dimension+1) * self->stride(dimension+1)); } else { - self->stride[dimension] = 1; + THTensor_setStrideAtDim(self, dimension, 1); } - self->size[dimension] = 1; + THTensor_setSizeAtDim(self, dimension, 1); } int THTensor_(isTransposed)(const THTensor *self) @@ -603,13 +579,13 @@ int THTensor_(isTransposed)(const THTensor *self) int64_t z = 1; int d; for (d = 0; d < self->_dim(); ++d) { - if (self->stride[d] == 0 && self->size[d] != 1) + if (self->stride(d) == 0 && self->size(d) != 1) return 0; - if (self->stride[d] > max_stride) { - max_stride = self->stride[d]; - size_max_stride = self->size[d]; + if (self->stride(d) > max_stride) { + max_stride = self->stride(d); + size_max_stride = self->size(d); } - z *= self->size[d]; + z *= self->size(d); } if (z == max_stride * size_max_stride) { return 1; @@ -624,10 +600,10 @@ int THTensor_(isContiguous)(const THTensor *self) int d; for(d = self->dim()-1; d >= 0; d--) { - if(self->size[d] != 1) + if(self->size(d) != 1) { - if(self->stride[d] == z) - z *= self->size[d]; + if(self->stride(d) == z) + z *= self->size(d); else return 0; } @@ -643,7 +619,7 @@ int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims) for(d = 0; d < self->_dim(); ++d) { - if(self->size[d] != THLongStorage_data(dims)[d]) + if(self->size(d) != THLongStorage_data(dims)[d]) return 0; } return 1; @@ -656,7 +632,7 @@ int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) return 0; for(d = 0; d < self->dim(); ++d) { - if(self->size[d] != src->size[d]) + if(self->size(d) != src->size(d)) return 0; } return 1; @@ -673,7 +649,7 @@ int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) int d; for (d = 0; d < 
self->_dim(); ++d) { - if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + if (self->size(d) != src->size(d) || self->stride(d) != src->stride(d)) return 0; } return 1; } @@ -690,15 +666,14 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) ptrdiff_t nElement = 1; int d; for(d = 0; d < self->_dim(); d++) - nElement *= self->size[d]; + nElement *= self->size(d); return nElement; } } void THTensor_(retain)(THTensor *self) { - if(self->flag & TH_TENSOR_REFCOUNTED) - ++self->refcount; + ++self->refcount; } void THTensor_(free)(THTensor *self) @@ -716,19 +691,6 @@ void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) /*******************************************************************************/ -static void THTensor_(rawInit)(THTensor *self) -{ - new (&self->refcount) std::atomic<int>(1); - self->storage = THStorage_(new)(); - self->storageOffset = 0; - self->size = static_cast<int64_t *>(THAlloc(sizeof(int64_t))); - self->stride = static_cast<int64_t *>(THAlloc(sizeof(int64_t))); - self->size[0] = 0; - self->stride[0] = 1; - self->dim_ = 1; - self->flag = TH_TENSOR_REFCOUNTED; -} - void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ @@ -778,12 +740,12 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t AT_CHECK(size[d] > 0, "sizes must be non-negative"); } #endif - if((self->dim() > d) && (size[d] != self->size[d])) { + if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } // NB: this used to test that stride[d] was >= 0 - if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + if((self->dim() > d) && stride && (stride[d] != self->stride(d))) { hascorrectsize = false; } } @@ -798,26 +760,24 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t if(nDimension != self->dim()) { - self->size = (int64_t *)THRealloc(self->size, sizeof(int64_t)*nDimension); - self->stride = (int64_t *)THRealloc(self->stride, sizeof(int64_t)*nDimension); - self->dim_ = nDimension; + THTensor_resizeDim(self, nDimension); } totalSize = 1; for(d = nDimension-1; d >= 0; d--) { - self->size[d] = size[d]; + THTensor_setSizeAtDim(self, d, size[d]); if(stride && (stride[d] >= 0) ) { - self->stride[d] = stride[d]; + THTensor_setStrideAtDim(self, d, stride[d]); } else { if(d == nDimension-1) { - self->stride[d] = 1; + THTensor_setStrideAtDim(self, d, 1); } else { // Keep stride monotonically increasing to match NumPy.
- self->stride[d] = std::max<int64_t>(self->size[d+1], 1)*self->stride[d+1]; + THTensor_setStrideAtDim(self, d, std::max<int64_t>(self->size(d+1), 1)*self->stride(d+1)); } } - totalSize += (self->size[d]-1)*self->stride[d]; + totalSize += (self->size(d)-1)*self->stride(d); } if(totalSize+self->storageOffset > 0) @@ -834,57 +794,57 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); } void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - return THStorage_(get)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } THDescBuff THTensor_(desc)(const THTensor *tensor) { @@ -898,7 +858,7 @@ THDescBuff THTensor_(desc)(const THTensor *tensor) { int i; for(i = 0; i < tensor->_dim(); i++) { if(n >= L) break; - n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + n += snprintf(str+n, L-n, "%" PRId64, tensor->size(i)); if(i < tensor->_dim()-1) { n += snprintf(str+n, L-n, "x"); } diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h index 03cc0169677fb9..cdc8f7edef41ce 100644 --- a/aten/src/TH/generic/THTensor.h +++ b/aten/src/TH/generic/THTensor.h @@ -4,7 +4,7 @@ /* a la lua? dim, storageoffset, ... et les methodes ? 
*/ -#define TH_TENSOR_REFCOUNTED 1 +#define THCTensor THTensor // Struct definition moved to THTensor.hpp typedef struct THTensor THTensor; @@ -33,9 +33,6 @@ TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self); TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self); TH_API real *THTensor_(data)(const THTensor *self); -TH_API void THTensor_(setFlag)(THTensor *self, const char flag); -TH_API void THTensor_(clearFlag)(THTensor *self, const char flag); - /**** creation methods ****/ TH_API THTensor *THTensor_(new)(void); diff --git a/aten/src/TH/generic/THTensorConv.cpp b/aten/src/TH/generic/THTensorConv.cpp index fb4670cf0f7903..0c590d6f9e400e 100644 --- a/aten/src/TH/generic/THTensorConv.cpp +++ b/aten/src/TH/generic/THTensorConv.cpp @@ -600,15 +600,15 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "covn2DRevger : Input image is smaller than kernel"); @@ -627,7 +627,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -639,7 +639,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -706,21 +706,21 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - istride1 = input->stride[1]; - nbatch = input->size[0]; - nInputPlane = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + istride0 = input->stride(0); + istride1 = input->stride(1); + nbatch = input->size(0); + nInputPlane = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelPlane = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelPlane = kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel"); - THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevger : Input batch and kernel batch is not same size"); + THArgCheck(kernel->size(0) == input->size(0) , 2, "conv2DRevger : Input batch and kernel batch is not same size"); nOutputRows = nInputRows - (nKernelRows - 1) * srow; nOutputCols = nInputCols - 
(nKernelCols - 1) * scol; @@ -737,7 +737,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -749,7 +749,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -820,15 +820,15 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel"); @@ -851,7 +851,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT { /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -863,7 +863,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -949,24 +949,24 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); input = THTensor_(newContiguous)(t_); - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + if (!(k_->stride(3) == 1) || !(k_->stride(2) == k_->size(3))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); kernel = k_; } - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); + nOutputPlane = kernel->size(0); + THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); @@ -989,7 +989,7 @@ void 
THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]; k++) + for (k = 0; k < r_->size(0); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -1001,7 +1001,7 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]; k++) + for (k = 0; k < r_->size(0); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -1087,24 +1087,24 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); input = THTensor_(newContiguous)(t_); - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + if (!(k_->stride(3) == 1) || !(k_->stride(2) == k_->size(3))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); kernel = k_; } - nbatch = input->size[0]; - nInputPlane = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nbatch = input->size(0); + nInputPlane = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); + nOutputPlane = kernel->size(0); + THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); @@ -1127,10 +1127,10 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(p) - for (p=0; p < r_->size[0]; p++) + for (p=0; p < r_->size(0); p++) { int64_t k; - for (k = 0; k < r_->size[1]; k++) + for (k = 0; k < r_->size(1); k++) { real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; int64_t l; @@ -1143,10 +1143,10 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(p) - for(p=0; p < r_->size[0]; p++) + for(p=0; p < r_->size(0); p++) { int64_t k; - for (k = 0; k < r_->size[1]; k++) + for (k = 0; k < r_->size(1); k++) { real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; int64_t l; @@ -1236,10 +1236,10 @@ void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputRows = input->size[0]; - nInputCols = input->size[1]; - nKernelRows = kernel->size[0]; - nKernelCols = kernel->size[1]; + nInputRows = input->size(0); + nInputCols = input->size(1); + nKernelRows = kernel->size(0); + nKernelCols = kernel->size(1); THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel"); @@ -1295,15 +1295,15 @@ void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 
= input->stride[0]; - nInputPlane = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel"); @@ -1374,15 +1374,15 @@ void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) @@ -1405,7 +1405,7 @@ void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT weight_data = THTensor_(data)(kernel); output_data = THTensor_(data)(r_); - nmaps = map->size[0]; + nmaps = map->size(0); for(k = 0; k < nmaps; k++) { @@ -1462,17 +1462,17 @@ void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelDepth= kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelDepth= kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel"); @@ -1550,17 +1550,17 @@ void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelDepth 
= kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows @@ -1639,26 +1639,26 @@ void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); input = THTensor_(newContiguous)(t_); - if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) { + if (!(k_->stride(4) == 1) || !(k_->stride(3) == k_->size(4))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); kernel = k_; } - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelDepth = kernel->size[2]; - nKernelRows = kernel->size[3]; - nKernelCols = kernel->size[4]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelDepth = kernel->size(2); + nKernelRows = kernel->size(3); + nKernelCols = kernel->size(4); + nOutputPlane = kernel->size(0); + THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel"); @@ -1736,12 +1736,12 @@ void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputDepth = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - nKernelDepth = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + nInputDepth = input->size(0); + nInputRows = input->size(1); + nInputCols = input->size(2); + nKernelDepth = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel"); @@ -1802,17 +1802,17 @@ void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelDepth = kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel"); @@ -1889,17 +1889,17 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor 
*t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelDepth = kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck((nInputDepth >= nKernelDepth @@ -1925,7 +1925,7 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT weight_data = THTensor_(data)(kernel); output_data = THTensor_(data)(r_); - nmaps = map->size[0]; + nmaps = map->size(0); for(k = 0; k < nmaps; k++) { diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp index de65f083ea38f3..fa989ddafaf403 100644 --- a/aten/src/TH/generic/THTensorFastGetSet.hpp +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -3,43 +3,43 @@ #else static inline real THTensor_(fastGet1d)(THTensor *self, int64_t x0) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)]; } static inline real THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)]; } static inline real THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; } static inline real THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; } static inline real THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; } static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)] = value; } static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, real value) { - 
(THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; } static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; } static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; } static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; } #endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index ca562e3021aec2..4793dec43de2af 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -7,7 +7,7 @@ Check if self is transpose of a contiguous matrix */ static int THTensor_(isTransposedContiguous)(THTensor *self) { - return self->stride[0] == 1 && self->stride[1] == self->size[0]; + return self->stride(0) == 1 && self->stride(1) == self->size(0); } /* If a matrix is a regular contiguous matrix, make sure it is transposed @@ -53,7 +53,7 @@ input space, like underdetermined gels. static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows) { /* check if user wants to reuse src and if it is correct shape/size */ - if (src == result && THTensor_(isTransposedContiguous)(src) && src->size[1] == nrows) + if (src == result && THTensor_(isTransposedContiguous)(src) && src->size(1) == nrows) THTensor_(retain)(result); else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */ result = THTensor_(new)(); @@ -77,14 +77,14 @@ static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, if (src == result) return result; - THTensor_(resize2d)(result, src->size[1], nrows); + THTensor_(resize2d)(result, src->size(1), nrows); THTensor_(checkTransposed)(result); - if (src->size[0] == nrows) + if (src->size(0) == nrows) THTensor_(copy)(result, src); else { - view = THTensor_(newNarrow)(result, 0, 0, src->size[0]); + view = THTensor_(newNarrow)(result, 0, 0, src->size(0)); THTensor_(copy)(view, src); THTensor_(free)(view); } @@ -98,7 +98,7 @@ freed by calling function. 
*/ static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src) { - return THTensor_(cloneColumnMajorNrows)(self, src, src->size[0]); + return THTensor_(cloneColumnMajorNrows)(self, src, src->size(0)); } void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) @@ -106,18 +106,20 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) int free_b = 0; if (a == NULL) a = ra_; if (b == NULL) b = rb_; - THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", - a->_dim()); - THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); - - if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 2, "B should not be empty"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", + a->size(0), a->size(1)); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); + + if (b->dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -129,8 +131,8 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) ra__ = THTensor_(cloneColumnMajor)(ra_, a); rb__ = THTensor_(cloneColumnMajor)(rb_, b); - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; + n = (int)ra__->size(0); + nrhs = (int)rb__->size(1); lda = n; ldb = n; @@ -163,14 +165,14 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, a->_dim()); THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", + a->size(0), a->size(1)); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -181,8 +183,8 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, ra__ = THTensor_(cloneColumnMajor)(ra_, a); rb__ = THTensor_(cloneColumnMajor)(rb_, b); - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; + n = (int)ra__->size(0); + nrhs = (int)rb__->size(1); lda = n; ldb = n; @@ -209,16 +211,18 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_. 
if (a == NULL) a = ra_; if (b == NULL) b = rb_; - THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", - a->_dim()); - THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 1, "B should not be empty"); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -231,14 +235,14 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) ra__ = THTensor_(cloneColumnMajor)(ra_, a); - m = ra__->size[0]; - n = ra__->size[1]; + m = ra__->size(0); + n = ra__->size(1); lda = m; ldb = (m > n) ? m : n; rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb); - nrhs = rb__->size[1]; + nrhs = rb__->size(1); info = 0; @@ -277,7 +281,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr) { int n, lda, lwork, info, ldvr; - THTensor *work, *wi, *wr, *a; + THTensor *work=nullptr, *wi, *wr, *a; real wkopt; real *rv_data; int64_t i; @@ -285,13 +289,13 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job THTensor *re__ = NULL; THTensor *rv__ = NULL; - THArgCheck(a_->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square"); + THArgCheck(a_->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a_->size(0) == a_->size(1), 1,"A should be square"); /* we want to definitely clone a_ for geev*/ a = THTensor_(cloneColumnMajor)(NULL, a_); - n = a->size[0]; + n = a->size(0); lda = n; wi = THTensor_(newWithSize1d)(n); @@ -310,24 +314,26 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job THTensor_(resize2d)(re_,n,2); re__ = THTensor_(newContiguous)(re_); - /* get optimal workspace size */ - THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), - NULL, 1, rv_data, ldvr, &wkopt, -1, &info); - - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - - THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), - NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", - THCleanup(THTensor_(free)(re__); - THTensor_(free)(rv__); - THTensor_(free)(a); - THTensor_(free)(wi); - THTensor_(free)(wr); - THTensor_(free)(work);), - "geev", info,""); + if (n > 0) { // lapack doesn't work with size 0 + /* get optimal workspace size */ + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, &wkopt, -1, &info); + + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, 
THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", + THCleanup(THTensor_(free)(re__); + THTensor_(free)(rv__); + THTensor_(free)(a); + THTensor_(free)(wi); + THTensor_(free)(wr); + THTensor_(free)(work);), + "geev", info,""); + } { real *re_data = THTensor_(data)(re__); @@ -355,11 +361,11 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz, const char *uplo) { if (a == NULL) a = rv_; - THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1,"A should be square"); + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size(0) == a->size(1), 1,"A should be square"); int n, lda, lwork, info; - THTensor *work; + THTensor *work = nullptr; real wkopt; THTensor *rv__ = NULL; @@ -367,25 +373,27 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz rv__ = THTensor_(cloneColumnMajor)(rv_, a); - n = rv__->size[0]; + n = rv__->size(0); lda = n; THTensor_(resize1d)(re_,n); re__ = THTensor_(newContiguous)(re_); /* get optimal workspace size */ - THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, - THTensor_(data)(re_), &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, - THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", - THCleanup(THTensor_(free)(rv__); - THTensor_(free)(re__); - THTensor_(free)(work);), - "syev", info,""); + if (n != 0) { + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", + THCleanup(THTensor_(free)(rv__); + THTensor_(free)(re__); + THTensor_(free)(work);), + "syev", info,""); + } // No eigenvectors specified if (*jobz == 'N') { @@ -407,7 +415,8 @@ void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu) { if (a == NULL) a = ra_; - THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); int k,m, n, lda, ldu, ldvt, lwork, info; THTensor *work; @@ -421,8 +430,8 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra ra__ = THTensor_(cloneColumnMajor)(ra_, a); - m = ra__->size[0]; - n = ra__->size[1]; + m = ra__->size(0); + n = ra__->size(1); k = (m < n ? 
m : n); lda = m; @@ -490,7 +499,7 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) { if (a == NULL) a = ra_; THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); int m, n, lda, info, lwork; real wkopt; @@ -500,8 +509,8 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) ra__ = THTensor_(cloneColumnMajor)(ra_, a); - m = ra__->size[0]; - n = ra__->size[1]; + m = ra__->size(0); + n = ra__->size(1); lda = m; ipiv = THIntTensor_newWithSize1d((int64_t)m); @@ -533,9 +542,9 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) { THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - int n = a->size[0]; + int n = a->size(0); /* Build full matrix */ real *p = THTensor_(data)(a); @@ -566,9 +575,9 @@ void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo) { THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - int n = a->size[0]; + int n = a->size(0); /* Build full matrix */ real *p = THTensor_(data)(a); @@ -600,14 +609,14 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo) { if (a == NULL) a = ra_; THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); int n, lda, info; THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, a); - n = ra__->size[0]; + n = ra__->size(0); lda = n; /* Run Factorization */ @@ -629,14 +638,14 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) a->_dim()); THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", + a->size(0), a->size(1)); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -647,8 +656,8 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) ra__ = THTensor_(cloneColumnMajor)(NULL, a); rb__ = THTensor_(cloneColumnMajor)(rb_, b); - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; + n = (int)ra__->size(0); + nrhs = (int)rb__->size(1); lda = n; ldb = n; @@ -672,14 +681,14 @@ void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) { if (a == NULL) a = ra_; THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); int n, lda, info; THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, 
a); - n = ra__->size[0]; + n = ra__->size(0); lda = n; /* Run inverse */ @@ -710,9 +719,9 @@ void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) */ void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) { THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - int n = a->size[0]; + int n = a->size(0); THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a); THIntTensor_resize1d(rpiv_, n); @@ -757,17 +766,17 @@ void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char */ void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) { - int m = a->size[0]; - int n = a->size[1]; + int m = a->size(0); + int n = a->size(1); int k = (m < n ? m : n); THTensor *ra_ = THTensor_(new)(); THTensor *rtau_ = THTensor_(new)(); THTensor *rr__ = THTensor_(new)(); THTensor_(geqrf)(ra_, rtau_, a); - THTensor_(resize2d)(rr__, k, ra_->size[1]); + THTensor_(resize2d)(rr__, k, ra_->size(1)); THTensor_(narrow)(rr__, ra_, 0, 0, k); THTensor_(triu)(rr_, rr__, 0); - THTensor_(resize2d)(rq_, ra_->size[0], k); + THTensor_(resize2d)(rq_, ra_->size(0), k); THTensor_(orgqr)(rq_, ra_, rtau_); THTensor_(narrow)(rq_, rq_, 1, 0, k); THTensor_(free)(ra_); @@ -795,15 +804,16 @@ void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a) { if (a == NULL) ra_ = a; - THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); THTensor *ra__ = NULL; /* Prepare the input for LAPACK, making a copy if necessary. */ ra__ = THTensor_(cloneColumnMajor)(ra_, a); - int m = ra__->size[0]; - int n = ra__->size[1]; + int m = ra__->size(0); + int n = ra__->size(1); int k = (m < n ? m : n); int lda = m; THTensor_(resize1d)(rtau_, k); @@ -856,8 +866,8 @@ void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau) THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, a); - int m = ra__->size[0]; - int k = tau->size[0]; + int m = ra__->size(0); + int k = tau->size(0); int lda = m; /* Dry-run to query the suggested size of the workspace. 
*/ @@ -909,9 +919,9 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, c); - int m = c->size[0]; - int n = c->size[1]; - int k = tau->size[0]; + int m = c->size(0); + int n = c->size(1); + int k = tau->size(0); int lda; if (*side == 'L') { @@ -948,7 +958,7 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a) { - AT_CHECK(!a->is_empty() && THTensor_(nDimension)(a) == 3, "expected 3D tensor, got size: ", a->sizes()); + AT_CHECK(THTensor_(nDimension)(a) == 3, "expected 3D tensor, got size: ", a->sizes()); if (!pivot) { THError("btrifact without pivoting is not implemented on the CPU"); } @@ -958,8 +968,8 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf THTensor_(copy)(ra_, a); } - int m = a->size[1]; - int n = a->size[2]; + int m = a->size(1); + int n = a->size(2); if (m != n) { THError("btrifact is only implemented for square matrices"); } @@ -967,9 +977,9 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf THTensor *ra__; int lda; - if (ra_->stride[1] == 1) { + if (ra_->stride(1) == 1) { // column ordered, what BLAS wants - lda = ra_->stride[2]; + lda = ra_->stride(2); ra__ = ra_; } else { // not column ordered, need to make it such (requires copy) @@ -977,7 +987,7 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf ra__ = THTensor_(newClone)(transp_r_); THTensor_(free)(transp_r_); THTensor_(transpose)(ra__, NULL, 1, 2); - lda = ra__->stride[2]; + lda = ra__->stride(2); } THTensor *ai = THTensor_(new)(); @@ -1039,18 +1049,18 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor THTensor_(copy)(rb_, b); } - int64_t num_batches = atf->size[0]; - int64_t n = atf->size[1]; - int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + int64_t num_batches = atf->size(0); + int64_t n = atf->size(1); + int nrhs = rb_->_dim() > 2 ? 
rb_->size(2) : 1; int lda, ldb; THTensor *atf_; THTensor *rb__; // correct ordering of A - if (atf->stride[1] == 1) { + if (atf->stride(1) == 1) { // column ordered, what BLAS wants - lda = atf->stride[2]; + lda = atf->stride(2); atf_ = atf; } else { // not column ordered, need to make it such (requires copy) @@ -1061,16 +1071,16 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor atf_ = THTensor_(newClone)(transp_r_); THTensor_(free)(transp_r_); THTensor_(transpose)(atf_, NULL, 1, 2); - lda = atf_->stride[2]; + lda = atf_->stride(2); } // correct ordering of B - if (rb_->stride[1] == 1) { + if (rb_->stride(1) == 1) { // column ordered - if (rb_->_dim() == 2 || rb_->size[2] == 1) { + if (rb_->_dim() == 2 || rb_->size(2) == 1) { ldb = n; } else { - ldb = rb_->stride[2]; + ldb = rb_->stride(2); } rb__ = rb_; } else { @@ -1080,7 +1090,7 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor rb__ = THTensor_(newClone)(transp_r_); THTensor_(free)(transp_r_); THTensor_(transpose)(rb__, NULL, 1, 2); - ldb = rb__->stride[2]; + ldb = rb__->stride(2); } else { rb__ = THTensor_(newClone)(rb_); ldb = n; diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 4fa0003984b5f8..e4152432a3068a 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -109,10 +109,7 @@ #define TH_CHECK_SAME_SIZE(TENSOR1, TENSOR2) \ { \ if(!THTensor_(isSameSizeAs)(TENSOR1, TENSOR2)) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same size", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size"); \ } \ } @@ -121,31 +118,27 @@ // TENSOR2 is src // TENSOR3 is index // Tests: -// 1. index->size[d] <= src->size[d] for all d -// 2. index->size[d] <= real->size[d] for all d != dim +// 1. index->size(d) <= src->size(d) for all d +// 2. 
index->size(d) <= real->size(d) for all d != dim #define TH_TENSOR_DIM_APPLY3_SIZE_SCATTER(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ { \ int shape_check_flag = 0; \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ - int64_t TENSOR3##_dim_size = TENSOR3->size[TH_TENSOR_DIM_APPLY_i]; \ + int64_t TENSOR3##_dim_size = TENSOR3->size(TH_TENSOR_DIM_APPLY_i); \ if (TH_TENSOR_DIM_APPLY_i != DIMENSION) { \ - if (TENSOR3##_dim_size > TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) { \ + if (TENSOR3##_dim_size > TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ } \ - if (TENSOR3##_dim_size > TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + if (TENSOR3##_dim_size > TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ } \ if (shape_check_flag == 1) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ - THError("Expected %s %s to be smaller size than %s %s and to be smaller than %s %s apart from dimension %d", \ - #TENSOR3, T3buff.str, #TENSOR2, T2buff.str, #TENSOR1, T1buff.str, DIMENSION); \ + AT_ERROR("Expected ", #TENSOR3, " ", TENSOR3->sizes(), " to be smaller size than ", #TENSOR2, " ", TENSOR2->sizes(), " and to be smaller than ", #TENSOR1, " ", TENSOR1->sizes(), " apart from dimension ", DIMENSION); \ } \ } @@ -297,8 +290,8 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) div = 1; for (dim = tensor->dim() - 1; dim >= 0; dim--) { - *(subscript_data + dim) = (i/div) % tensor->size[dim]; - div *= tensor->size[dim]; + *(subscript_data + dim) = (i/div) % tensor->size(dim); + div *= tensor->size(dim); } subscript_data += tensor->dim(); @@ -314,14 +307,20 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens int64_t *index_data; real *tensor_data, *src_data; +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(index->_dim() <= 1, 3, "Index is supposed to be an empty tensor or a vector"); THArgCheck(dim < src->_dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); THArgCheck(src->_dim() > 0, 2, "Source tensor is empty"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + //THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); +#endif numel = THLongTensor_nElement(index); newSize = THLongStorage_newWithSize(src->dim()); - THLongStorage_rawCopy(newSize,src->size); + THLongStorage_rawCopy(newSize, THTensor_getSizePtr(src)); #ifdef DEBUG THAssert(numel <= LONG_MAX); #endif @@ -336,10 +335,10 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens { tensor_data = THTensor_(data)(tensor); src_data = THTensor_(data)(src); - ptrdiff_t rowsize = THTensor_(nElement)(src) / src->size[0]; + ptrdiff_t rowsize = src->size(0) == 0 ? 
1: THTensor_(nElement)(src) / src->size(0); // check that the indices are within range - int64_t max = src->size[0] - 1 + TH_INDEX_BASE; + int64_t max = src->size(0) - 1 + TH_INDEX_BASE; for (i=0; i max) { THLongTensor_free(index); @@ -347,7 +346,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->_dim() == 1) { + if (src->dim() == 1) { #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; i_dim() == 1) + else if (src->dim() == 1) { for (i=0; i_dim() > 1 ) + if (tensor->dim() > 1 ) { tSlice = THTensor_(new)(); sSlice = THTensor_(new)(); @@ -418,8 +417,8 @@ void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTens } static ptrdiff_t THTensor_(dataOffset)(THTensor* tensor, ptrdiff_t linearIndex) { - int64_t *size = tensor->size; - int64_t *stride = tensor->stride; + auto size = tensor->sizes(); + auto stride = tensor->strides(); int nDim = tensor->_dim(); ptrdiff_t dataOffset = 0; for (int i = nDim - 1; i >= 0; i--) { @@ -439,7 +438,7 @@ static inline int64_t THTensor_(wrapLinearIndex)(int64_t linearIndex, int64_t nu void THTensor_(take)(THTensor *r_, THTensor *src, THLongTensor *index) { - THTensor_(resizeNd)(r_, index->dim(), index->size, NULL); + THTensor_(resizeNd)(r_, index->dim(), THTensor_getSizePtr(index), NULL); THTensor* dst = THTensor_(newContiguous)(r_); index = THLongTensor_newContiguous(index); @@ -513,14 +512,19 @@ void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTenso int64_t *index_data; numel = THLongTensor_nElement(index); +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(index->_dim() == 1, 3, "Index is supposed to be a vector"); THArgCheck(dim < src->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + THArgCheck(numel == src->size(dim),4,"Number of indices should be equal to source:size(dim)"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); - if (tensor->_dim() > 1) + if (tensor->dim() > 1) { tSlice = THTensor_(new)(); sSlice = THTensor_(new)(); @@ -554,15 +558,20 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v int64_t *index_data; numel = THLongTensor_nElement(index); +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(index->_dim() == 1, 3, "Index is supposed to be a vector"); THArgCheck(dim < tensor->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); for (i=0; i_dim() > 1) + if (tensor->dim() > 1) { tSlice = THTensor_(new)(); THTensor_(select)(tSlice, tensor,dim,index_data[i] - TH_INDEX_BASE); @@ -581,11 +590,11 @@ void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *i { int64_t elems_per_row, i, idx; - THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(src), 4, + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(src), 4, "Index tensor must have same dimensions as input tensor"); - THArgCheck(dim >= 0 && dim < 
THTensor_(_nDimension)(tensor), 3, + THArgCheck(dim >= 0 && dim < THTensor_(nDimension)(tensor), 3, "Index dimension is out of bounds"); - THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 2, + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2, "Input tensor must have same dimensions as output tensor"); elems_per_row = THLongTensor_size(index, dim); @@ -608,11 +617,19 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor { int64_t elems_per_row, i, idx; +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, "Index tensor must have same dimensions as output tensor"); THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 4, "Input tensor must have same dimensions as output tensor"); +#else + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#endif elems_per_row = THLongTensor_size(index, dim); @@ -634,10 +651,10 @@ void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTen { int64_t elems_per_row, i, idx; - THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, "Index tensor must have same dimensions as output tensor"); - THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 4, + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, "Input tensor must have same dimensions as output tensor"); elems_per_row = THLongTensor_size(index, dim); @@ -1931,20 +1948,20 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, T void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) { - if( (mat->_dim() != 2) || (vec->_dim() != 1) ) + if( (mat->dim() != 2) || (vec->dim() != 1) ) THError("matrix and vector expected, got %dD, %dD", - mat->_dim(), vec->_dim()); + mat->dim(), vec->dim()); - if( mat->size[1] != vec->size[0] ) { + if( mat->size(1) != vec->size(0) ) { THDescBuff bm = THTensor_(sizeDesc)(mat); THDescBuff bv = THTensor_(sizeDesc)(vec); THError("size mismatch, %s, %s", bm.str, bv.str); } - if(t->_dim() != 1) - THError("vector expected, got t: %dD", t->_dim()); + if(t->dim() != 1) + THError("vector expected, got t: %dD", t->dim()); - if(t->size[0] != mat->size[0]) { + if(t->size(0) != mat->size(0)) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bm = THTensor_(sizeDesc)(mat); THError("size mismatch, t: %s, mat: %s", bt.str, bm.str); @@ -1959,28 +1976,28 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(mat->stride[0] == 1 && LDA_COND(mat->size[0], mat->size[1], mat->stride[1])) + if(mat->stride(0) == 1 && LDA_COND(mat->size(0), mat->size(1), mat->stride(1))) { - THBlas_(gemv)('n', mat->size[0], 
mat->size[1], - alpha, THTensor_(data)(mat), mat->stride[1], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); + THBlas_(gemv)('n', mat->size(0), mat->size(1), + alpha, THTensor_(data)(mat), mat->stride(1), + THTensor_(data)(vec), vec->stride(0), + beta, THTensor_(data)(r_), r_->stride(0)); } - else if(mat->stride[1] == 1 && LDA_COND(mat->size[1], mat->size[0], mat->stride[0])) + else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0))) { - THBlas_(gemv)('t', mat->size[1], mat->size[0], - alpha, THTensor_(data)(mat), mat->stride[0], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); + THBlas_(gemv)('t', mat->size(1), mat->size(0), + alpha, THTensor_(data)(mat), mat->stride(0), + THTensor_(data)(vec), vec->stride(0), + beta, THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cmat = THTensor_(newContiguous)(mat); - THBlas_(gemv)('t', mat->size[1], mat->size[0], - alpha, THTensor_(data)(cmat), cmat->stride[0], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); + THBlas_(gemv)('t', mat->size(1), mat->size(0), + alpha, THTensor_(data)(cmat), cmat->stride(0), + THTensor_(data)(vec), vec->stride(0), + beta, THTensor_(data)(r_), r_->stride(0)); THTensor_(free)(cmat); } @@ -1990,8 +2007,8 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) { - int64_t N1 = m1->size[0]; - int64_t N2 = m2->size[0]; + int64_t N1 = m1->size(0); + int64_t N2 = m2->size(0); int64_t dim; real *m1_p; real *m2_p; @@ -2006,8 +2023,8 @@ void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1); THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2); - dim = m1->size[1]; - THArgCheck(m1->size[1] == m2->size[1], 3, "m1 and m2 must have the same inner vector dim"); + dim = m1->size(1); + THArgCheck(m1->size(1) == m2->size(1), 3, "m1 and m2 must have the same inner vector dim"); m1_p = THTensor_(data)(m1); m2_p = THTensor_(data)(m2); @@ -2037,19 +2054,19 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor int free_m1 = 0; int free_m2 = 0; - if( (m1->_dim() != 2) || (m2->_dim() != 2)) - THError("matrices expected, got %dD, %dD tensors", m1->_dim(), m2->_dim()); + if( (m1->dim() != 2) || (m2->dim() != 2)) + THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim()); - if(m1->size[1] != m2->size[0]) { + if(m1->size(1) != m2->size(0)) { THDescBuff bm1 = THTensor_(sizeDesc)(m1); THDescBuff bm2 = THTensor_(sizeDesc)(m2); THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); } - if( t->_dim() != 2 ) - THError("matrix expected, got %dD tensor for t", t->_dim()); + if( t->dim() != 2 ) + THError("matrix expected, got %dD tensor for t", t->dim()); - if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + if( (t->size(0) != m1->size(0)) || (t->size(1) != m2->size(1)) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bm1 = THTensor_(sizeDesc)(m1); THDescBuff bm2 = THTensor_(sizeDesc)(m2); @@ -2068,14 +2085,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor #define LDC_COND(M, N, LDC) ((N) == 1 || (LDC) >= THMax(1, M)) /* r_ */ - if(r_->stride[0] == 1 && - LDC_COND(r_->size[0], r_->size[1], r_->stride[1])) + if(r_->stride(0) == 1 && + LDC_COND(r_->size(0), r_->size(1), r_->stride(1))) { transpose_r = 'n'; r__ = r_; } - else 
if(r_->stride[1] == 1 && - LDC_COND(r_->size[1], r_->size[0], r_->stride[0])) + else if(r_->stride(1) == 1 && + LDC_COND(r_->size(1), r_->size(0), r_->stride(0))) { THTensor *swap = m2; m2 = m1; @@ -2095,21 +2112,21 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor #undef LDC_COND - int64_t m = r__->size[(transpose_r == 'n' ? 0 : 1)]; - int64_t n = r__->size[(transpose_r == 'n' ? 1 : 0)]; - int64_t k = m1->size[(transpose_r == 'n' ? 1 : 0)]; - int64_t ldr__ = r__->stride[(transpose_r == 'n' ? 1 : 0)]; + int64_t m = r__->size((transpose_r == 'n' ? 0 : 1)); + int64_t n = r__->size((transpose_r == 'n' ? 1 : 0)); + int64_t k = m1->size((transpose_r == 'n' ? 1 : 0)); + int64_t ldr__ = r__->stride((transpose_r == 'n' ? 1 : 0)); /* m1 */ /* Need ldm1_ >= max(1, (transpose_m1 == 'n' ? m : k)) */ - if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m1->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, m)) + if(m1->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m1->stride((transpose_r == 'n' ? 1 : 0)) >= THMax(1, m)) { transpose_m1 = 'n'; m1_ = m1; } - else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m1->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, k)) + else if(m1->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m1->stride((transpose_r == 'n' ? 0 : 1)) >= THMax(1, k)) { transpose_m1 = 't'; m1_ = m1; @@ -2123,14 +2140,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor /* m2 */ /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */ - if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m2->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, k)) + if(m2->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m2->stride((transpose_r == 'n' ? 1 : 0)) >= THMax(1, k)) { transpose_m2 = 'n'; m2_ = m2; } - else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m2->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, n)) + else if(m2->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m2->stride((transpose_r == 'n' ? 0 : 1)) >= THMax(1, n)) { transpose_m2 = 't'; m2_ = m2; @@ -2142,8 +2159,8 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor free_m2 = 1; } - int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]); - int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]); + int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))); + int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 
0 : 1))); #pragma omp critical(blasgemm) /* do the operation */ @@ -2174,14 +2191,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) { - if( (vec1->_dim() != 1) || (vec2->_dim() != 1) ) + if( (vec1->dim() != 1) || (vec2->dim() != 1) ) THError("vector and vector expected, got %dD, %dD tensors", - vec1->_dim(), vec2->_dim()); + vec1->dim(), vec2->dim()); - if(t->_dim() != 2) - THError("expected matrix, got %dD tensor for t", t->_dim()); + if(t->dim() != 2) + THError("expected matrix, got %dD tensor for t", t->dim()); - if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + if( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bv1 = THTensor_(sizeDesc)(vec1); THDescBuff bv2 = THTensor_(sizeDesc)(vec2); @@ -2203,28 +2220,28 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(r_->stride[0] == 1 && LDA_COND(vec1->size[0], vec2->size[0], r_->stride[1])) + if(r_->stride(0) == 1 && LDA_COND(vec1->size(0), vec2->size(0), r_->stride(1))) { - THBlas_(ger)(vec1->size[0], vec2->size[0], - alpha, THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(r_), r_->stride[1]); + THBlas_(ger)(vec1->size(0), vec2->size(0), + alpha, THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(r_), r_->stride(1)); } - else if(r_->stride[1] == 1 && LDA_COND(vec2->size[0], vec1->size[0], r_->stride[0])) + else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1->size(0), r_->stride(0))) { - THBlas_(ger)(vec2->size[0], vec1->size[0], - alpha, THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(r_), r_->stride[0]); + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cr = THTensor_(newClone)(r_); - THBlas_(ger)(vec2->size[0], vec1->size[0], - alpha, THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(cr), cr->stride[0]); + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(cr), cr->stride(0)); THTensor_(freeCopyTo)(cr, r_); } @@ -2236,8 +2253,8 @@ void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THT { int64_t batch; - THArgCheck(!batch1->is_empty() && THTensor_(nDimension)(batch1) == 3, 1, "expected non-empty 3D tensor"); - THArgCheck(!batch2->is_empty() && THTensor_(nDimension)(batch2) == 3, 2, "expected non-empty 3D tensor"); + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor"); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor"); THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, "equal number of batches expected, got %d, %d", THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); @@ -2277,8 +2294,8 @@ void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, TH { int64_t batch; - THArgCheck(THTensor_(_nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(_nDimension)(batch1)); - THArgCheck(THTensor_(_nDimension)(batch2) 
== 3, 2, "expected 3D tensor, got %dD", THTensor_(_nDimension)(batch2)); + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch1)); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch2)); THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, "equal number of batches expected, got %d, %d", THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); @@ -2357,7 +2374,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THLongStorage_free(dim); // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { real theMax; real value; int64_t theIndex; @@ -2390,7 +2407,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int } THLongTensor_zero(indices_); - if(t->size[dimension] == 1) { + if(t->size(dimension) == 1) { if (!keepdim) { THTensor_(squeeze1d)(values_, values_, dimension); THLongTensor_squeeze1d(indices_, indices_, dimension); @@ -2400,13 +2417,13 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THTensor *tempValues_ = THTensor_(newWithTensor)(values_); // tempValues_.expand_as(t) - tempValues_->size[dimension] = t->size[dimension]; - tempValues_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempValues_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempValues_, dimension, 0); THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); // tempIndices_.expand_as(t) - tempIndices_->size[dimension] = t->size[dimension]; - tempIndices_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempIndices_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempIndices_, dimension, 0); TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) { @@ -2441,7 +2458,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THLongStorage_free(dim); // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { real theMax; real value; int64_t theIndex; @@ -2474,7 +2491,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int } THLongTensor_zero(indices_); - if(t->size[dimension] == 1) { + if(t->size(dimension) == 1) { if (!keepdim) { THTensor_(squeeze1d)(values_, values_, dimension); THLongTensor_squeeze1d(indices_, indices_, dimension); @@ -2484,13 +2501,13 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THTensor *tempValues_ = THTensor_(newWithTensor)(values_); // tempValues_.expand_as(t) - tempValues_->size[dimension] = t->size[dimension]; - tempValues_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempValues_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempValues_, dimension, 0); THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); // tempIndices_.expand_as(t) - tempIndices_->size[dimension] = t->size[dimension]; - tempIndices_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempIndices_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempIndices_, dimension, 0); TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) { @@ -2543,16 +2560,16 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) for(j = 0; j < 
r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 0; - for(j=0; j < t->size[dimension]; ++j) { - *r__data += *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data += *(t_data + j*t->stride(dimension)); } } } else { @@ -2564,7 +2581,7 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) #endif if (serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; int64_t i; @@ -2575,8 +2592,8 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) THTensor_(zero)(r_); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;); THTensor_(free)(temp_); @@ -2623,16 +2640,16 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) for(j = 0; j < r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 1; - for(j=0; j < t->size[dimension]; ++j) { - *r__data *= *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data *= *(t_data + j*t->stride(dimension)); } } } else { @@ -2645,7 +2662,7 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) if(serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal prod = 1; int64_t i; @@ -2656,8 +2673,8 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) THTensor_(fill)(r_, 1); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;); THTensor_(free)(temp_); @@ -2670,7 +2687,7 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) { - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", dimension + TH_INDEX_BASE); THTensor_(resizeAs)(r_, t); @@ -2687,7 +2704,7 @@ void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) { - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", dimension + TH_INDEX_BASE); THTensor_(resizeAs)(r_, t); @@ -2745,11 +2762,11 @@ void 
THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) { int i; - if(THTensor_(_nDimension)(a) != THTensor_(_nDimension)(b)) + if(THTensor_(nDimension)(a) != THTensor_(nDimension)(b)) THError("inconsistent tensor dimension %dD, %dD", - THTensor_(_nDimension)(a), THTensor_(_nDimension)(b)); + THTensor_(nDimension)(a), THTensor_(nDimension)(b)); - for(i = 0; i < THTensor_(_nDimension)(a); i++) + for(i = 0; i < THTensor_(nDimension)(a); i++) { if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) { THDescBuff ba = THTensor_(sizeDesc)(a); @@ -2760,7 +2777,7 @@ void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) if(dimension < 0) { - for(i = 0; i < THTensor_(_nDimension)(a); i++) + for(i = 0; i < THTensor_(nDimension)(a); i++) { if(THTensor_(size)(a, i) == 3) { @@ -2774,7 +2791,7 @@ void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) } } - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(a), 3, "dimension %d out of range", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(a), 3, "dimension %d out of range", dimension + TH_INDEX_BASE); THArgCheck(THTensor_(size)(a, dimension) == 3, 3, "dimension %d does not have size 3", dimension + TH_INDEX_BASE); @@ -2893,7 +2910,7 @@ void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m) r__data = THTensor_(data)(r_); sz = THMin(THTensor_(size)(r_, 0), THTensor_(size)(r_, 1)); for(i = 0; i < sz; i++) - r__data[i*(r_->stride[0]+r_->stride[1])] = 1; + r__data[i*(r_->stride(0)+r_->stride(1))] = 1; } @@ -3169,7 +3186,7 @@ static void THTensor_(quicksortdescend)(real *arr, int64_t *idx, int64_t element void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder) { - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "invalid dimension %d", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d", dimension + TH_INDEX_BASE); THTensor_(resizeAs)(rt_, t); @@ -3376,7 +3393,7 @@ void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t t_size_dim; THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); - THArgCheck(k > 0 && k <= t->size[dimension], 2, "selected index out of range"); + THArgCheck(k > 0 && k <= t->size(dimension), 2, "selected index out of range"); int in_dims = THTensor_(_nDimension)(t); THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); @@ -3430,11 +3447,19 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { +#ifndef USE_TH_SIZE_ZERO_DIM int numDims = THTensor_(_nDimension)(t); +#else + int numDims = THTensor_(nDimension)(t); +#endif THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); int64_t sliceSize = THTensor_(size)(t, dim); +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); +#else + THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); +#endif THTensor *tmpResults = THTensor_(new)(); THTensor_(resize1d)(tmpResults, sliceSize); @@ -3577,8 +3602,8 @@ inline void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, if (dim == dimension) { continue; } - int64_t first_dim_size = first->size[dim]; - int64_t second_dim_size = second->size[dim]; + int64_t first_dim_size = first->size(dim); + int64_t second_dim_size = 
second->size(dim); THArgCheck(first_dim_size == second_dim_size, 0, "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d", dimension, (long long)first_dim_size, (long long)second_dim_size, dim); @@ -3622,13 +3647,13 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int continue; } THTensor_(check_shape_except_dim)(notSkippedTensor, tensor, dimension); - cat_dim_size += tensor->size[dimension]; + cat_dim_size += tensor->size(dimension); } // Compute the size of the result THLongStorage *size = THLongStorage_newWithSize(nDims); for (int dim = 0; dim < nDims; dim++) { - int64_t result_dim_size = notSkippedTensor->size[dim]; + int64_t result_dim_size = notSkippedTensor->size(dim); if (dim == dimension) { result_dim_size = cat_dim_size; } @@ -3667,7 +3692,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int offset = 0; for (int j = 0; j < numInputs; j++) { if (!should_skip(inputs[j])) { - int64_t dimSize = inputs[j]->size[dimension]; + int64_t dimSize = inputs[j]->size(dimension); THTensor *nt = THTensor_(newWithTensor)(result); THTensor_(narrow)(nt, NULL, dimension, offset, dimSize); THTensor_(copy)(nt, inputs[j]); @@ -3707,25 +3732,25 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) #define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \ void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value) \ { \ - THByteTensor_resizeNd(r_, t->dim(), t->size, NULL); \ + THByteTensor_resizeNd(r_, t->dim(), THTensor_getSizePtr(t), NULL); \ TH_TENSOR_APPLY2(unsigned char, r_, real, t, \ *r__data = (*t_data OP value) ? 1 : 0;); \ } \ void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \ { \ - THTensor_(resizeNd)(r_, t->dim(), t->size, NULL); \ + THTensor_(resizeNd)(r_, t->dim(), THTensor_getSizePtr(t), NULL); \ TH_TENSOR_APPLY2(real, r_, real, t, \ *r__data = (*t_data OP value) ? 1 : 0;); \ } \ void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \ { \ - THByteTensor_resizeNd(r_, ta->dim(), ta->size, NULL); \ + THByteTensor_resizeNd(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \ TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \ *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ } \ void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \ { \ - THTensor_(resizeNd)(r_, ta->dim(), ta->size, NULL); \ + THTensor_(resizeNd)(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \ TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \ *r__data = (*ta_data OP *tb_data) ? 
1 : 0;); \ } \ @@ -3926,16 +3951,16 @@ void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim for(j = 0; j < r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 1; - for(j=0; j < t->size[dimension]; ++j) { - *r__data = *r__data && *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data = *r__data && *(t_data + j*t->stride(dimension)); } } } else { @@ -3948,7 +3973,7 @@ void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim if(serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal prod = 1; int64_t i; @@ -3959,8 +3984,8 @@ void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim THTensor_(fill)(r_, 1); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data && *t_data;); THTensor_(free)(temp_); @@ -4006,16 +4031,16 @@ void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim for(j = 0; j < r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 0; - for(j=0; j < t->size[dimension]; ++j) { - *r__data = *r__data || *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data = *r__data || *(t_data + j*t->stride(dimension)); } } } else { @@ -4027,7 +4052,7 @@ void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim #endif if (serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; int64_t i; @@ -4038,8 +4063,8 @@ void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim THTensor_(zero)(r_); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data || *t_data;); THTensor_(free)(temp_); @@ -4123,7 +4148,7 @@ void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim) dimension + TH_INDEX_BASE); THTensor_(sum)(r_, t, dimension, keepdim); - THTensor_(div)(r_, r_, t->size[dimension]); + THTensor_(div)(r_, r_, t->size(dimension)); } void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim) @@ -4288,21 +4313,20 @@ accreal THTensor_(normall)(THTensor *tensor, real value) void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension, real maxnorm) { - int i; THTensor *rowR, *rowS; - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(src), 3, "invalid dimension %d", + 
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(src), 3, "invalid dimension %d", dimension + TH_INDEX_BASE); THArgCheck(value > 0, 2, "non-positive-norm not supported"); - THArgCheck(THTensor_(_nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions", - THTensor_(_nDimension)(src)); + THArgCheck(THTensor_(nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions", + THTensor_(nDimension)(src)); rowR = THTensor_(new)(); rowS = THTensor_(new)(); THTensor_(resizeAs)(res, src); - for (i=0; isize[dimension]; i++) + for (int64_t i = 0; i < src->size(dimension); i++) { real norm = 0; real new_norm; @@ -4454,7 +4478,7 @@ void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real min real minval; real maxval; - THTensor_(resize2d)(hist, tensor->size[0], nbins); + THTensor_(resize2d)(hist, tensor->size(0), nbins); THTensor_(zero)(hist); minval = minvalue; diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 3ddbfa66b4c7b7..ceb927429573fc 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -79,7 +79,7 @@ void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator #endif } else { intTensor = THIntTensor_new(); - THIntTensor_resizeNd(intTensor, self->dim(), self->size, NULL); + THIntTensor_resizeNd(intTensor, self->dim(), THTensor_getSizePtr(self), nullptr); tmp = THIntTensor_data(intTensor); } @@ -284,9 +284,9 @@ void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor small = THLongTensor_fastGet1d(smaller, small_c-1); THLongTensor_fastSet1d(J, small, large); - q_data[large * q->stride[0]] -= 1.0 - THTensor_(fastGet1d)(q, small); + q_data[large * q->stride(0)] -= 1.0 - THTensor_(fastGet1d)(q, small); - if(q_data[large * q->stride[0]] < 1.0) + if(q_data[large * q->stride(0)] < 1.0) { THLongTensor_fastSet1d(smaller, small_c-1, large); large_c -= 1; @@ -317,7 +317,7 @@ void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor { for (i=0; i < inputsize; i++) { - q_data[i*q->stride[0]] /= q_max; + q_data[i*q->stride(0)] /= q_max; } } for (i=0; i < inputsize; i++) @@ -399,7 +399,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { val = THStorage_(get)( \ prob_dist->storage, \ - prob_dist->storageOffset+i*prob_dist->stride[0]+j*prob_dist->stride[1] \ + prob_dist->storageOffset+i*prob_dist->stride(0)+j*prob_dist->stride(1) \ ); THArgCheckWithCleanup((val >= 0), THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), @@ -412,7 +412,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso sum += val; THDoubleStorage_set( cum_dist->storage, \ - cum_dist->storageOffset+j*cum_dist->stride[0], \ + cum_dist->storageOffset+j*cum_dist->stride(0), \ sum \ ); } @@ -426,7 +426,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { for (j=0; jstride[0]] /= sum; + THDoubleTensor_data(cum_dist)[j*cum_dist->stride(0)] /= sum; } } @@ -442,14 +442,14 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso double cum_prob; int sample_idx; /* Make sure the last cumulative distribution bucket sums to 1 */ - THDoubleTensor_data(cum_dist)[(n_categories-1)*cum_dist->stride[0]] = 1; + THDoubleTensor_data(cum_dist)[(n_categories-1)*cum_dist->stride(0)] = 1; while(right_pointer - left_pointer > 0) { mid_pointer = left_pointer + 
(right_pointer - left_pointer) / 2; cum_prob = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+mid_pointer*cum_dist->stride[0] \ + cum_dist->storageOffset+mid_pointer*cum_dist->stride(0) \ ); if (cum_prob < uniform_sample) { @@ -465,7 +465,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso /* store in result tensor (will be incremented for lua compat by wrapper) */ THLongStorage_set( \ self->storage, \ - self->storageOffset+i*self->stride[0]+j*self->stride[1], \ + self->storageOffset+i*self->stride(0)+j*self->stride(1), \ sample_idx \ ); @@ -481,13 +481,13 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { new_val = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride[0] \ + cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride(0) \ ); } /* marginal cumulative mass (i.e. original probability) of sample */ diff = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+sample_idx*cum_dist->stride[0] \ + cum_dist->storageOffset+sample_idx*cum_dist->stride(0) \ ) - new_val; /* new sum of marginals is not one anymore... */ sum = 1.0 - diff; @@ -495,7 +495,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { new_val = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride[0] \ + cum_dist->storageOffset+k*cum_dist->stride(0) \ ); if (k >= sample_idx) { @@ -506,7 +506,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso new_val /= sum; THDoubleStorage_set( \ cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride[0], \ + cum_dist->storageOffset+k*cum_dist->stride(0), \ new_val \ ); } diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h index 0f7724dd8d4fb2..652bb7a4a95d47 100644 --- a/aten/src/THC/THCAllocator.h +++ b/aten/src/THC/THCAllocator.h @@ -3,8 +3,8 @@ #include "THCGeneral.h" -THC_API THAllocator* getTHCudaHostAllocator(); -THC_API THAllocator* getTHCUVAAllocator(); +THC_API THAllocator* getTHCudaHostAllocator(void); +THC_API THAllocator* getTHCUVAAllocator(void); // IPC doesn't support (re)allocation #ifdef __cplusplus diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index e2003da5791f97..f36fc74112eb8c 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -80,10 +80,19 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, #endif /* Level 2 */ + +void adjustLdLevel2(int64_t m, int64_t n, int64_t *lda) +{ + // Note: leading dimensions generally are checked that they are > 0 and at least as big the result + // requires (even if the value won't be used). + // TODO: why does Level3 check trans but this doesn't? 
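+  // (Re the TODO above: for the Level 2 routines the matrix argument is always
+  // stored as an m x n column-major array regardless of trans, so only m matters
+  // here; in Level 3 the stored shapes of A and B depend on transa/transb.)
+  // cuBLAS still validates lda >= max(1, m) even when n <= 1 means lda is never
+  // used to step between columns, e.g. a 5 x 1 operand built from a 1-d tensor
+  // can arrive with lda == 1 and has to be bumped to 5 (or to 1 when m == 0).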
+ if (n <= 1) + *lda = std::max(m, 1); +} + void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); cublasOperation_t op; if (trans == 't') op = CUBLAS_OP_T; @@ -113,8 +122,7 @@ void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float a void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double alpha, double *a, int64_t lda, double *x, int64_t incx, double beta, double *y, int64_t incy) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); cublasOperation_t op; if (trans == 't') op = CUBLAS_OP_T; @@ -144,8 +152,7 @@ void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { @@ -166,8 +173,7 @@ void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float * void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { @@ -197,41 +203,44 @@ cublasOperation_t convertTransToCublasOperation(char trans) { } } -void adjustLd(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t *lda, int64_t *ldb, int64_t *ldc) +void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t *lda, int64_t *ldb, int64_t *ldc) { int transa_ = ((transa == 't') || (transa == 'T')); int transb_ = ((transb == 't') || (transb == 'T')); - if(n == 1) - *ldc = m; + // Note: leading dimensions generally are checked that they are > 0 and at least as big the result + // requires (even if the value won't be used). 
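+  // Column-major gemm expects ldc >= max(1, m), lda >= max(1, m) for 'n' or
+  // max(1, k) for 't', and ldb >= max(1, k) for 'n' or max(1, n) for 't'; each
+  // value is only clamped below when the corresponding operand has at most one
+  // stored column, so the leading dimension is never actually used to advance
+  // between columns.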
+ if(n <= 1) + *ldc = std::max(m, 1); if(transa_) { - if(m == 1) - *lda = k; + if(m <= 1) + *lda = std::max(k, 1); } else { - if(k == 1) - *lda = m; + if(k <= 1) + *lda = std::max(m, 1); } if(transb_) { - if(k == 1) - *ldb = n; + if(k <= 1) + *ldb = std::max(n, 1); } else { - if(n == 1) - *ldb = k; + if(n <= 1) + *ldb = std::max(k, 1); } + } /* Level 3 */ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -261,7 +270,7 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) { - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -312,7 +321,7 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -346,7 +355,7 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -375,7 +384,7 @@ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -399,7 +408,7 @@ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -422,7 +431,7 @@ void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -445,7 +454,7 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } - 
adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); diff --git a/aten/src/THC/THCCachingHostAllocator.h b/aten/src/THC/THCCachingHostAllocator.h index d270c2081e8726..adb86cbb120526 100644 --- a/aten/src/THC/THCCachingHostAllocator.h +++ b/aten/src/THC/THCCachingHostAllocator.h @@ -19,7 +19,7 @@ // Note that this allocator does not split larger allocations into smaller // blocks, unlike the caching device allocator. // -THC_API THAllocator* getTHCCachingHostAllocator(); +THC_API THAllocator* getTHCCachingHostAllocator(void); // Records an event in the specified stream. The allocation 'ptr' will not be // re-used until the event has occurred. diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index ab92022d9a6a1d..4d46a01296cb20 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -8,58 +8,6 @@ #include "generic/THCStorage.cpp" #include "THCGenerateAllTypes.h" -THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type) -{ - return THCStorage_newWithSize(state, scalar_type, 0); -} - -THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size) -{ - return THCStorage_newWithAllocator( - state, scalar_type, size, - state->cudaDeviceAllocator); -} - -THCStorage* THCStorage_newWithAllocator(THCState *state, - at::ScalarType scalar_type, - ptrdiff_t size, - at::Allocator* allocator) -{ - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - memset(storage, 0, sizeof(THCStorage)); - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->scalar_type = scalar_type; - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; - storage->size = size; - - at::DataPtr ptr; - try { - ptr = allocator->allocate(size * at::elementSize(scalar_type)); - } catch(...) 
{ - free(storage); - throw; - } - new (&storage->data_ptr) at::DataPtr(std::move(ptr)); - return storage; -} - -void THCStorage_free(THCState *state, THCStorage *storage) -{ - if (storage->flag & TH_STORAGE_REFCOUNTED) { - if (--storage->refcount == 0) { - if (storage->finalizer) { - (*storage->finalizer)(); - } - storage->finalizer.~unique_ptr(); - storage->data_ptr.~DataPtr(); - THStorage_weakFree(storage); - } - } -} - void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); @@ -103,18 +51,13 @@ int THCStorage_getDevice(THCState* state, const THCStorage* storage) { return storage->data_ptr.device().index(); } -THCStorage* THCStorage_newWithDataAndAllocator( - THCState *state, at::ScalarType scalar_type, at::DataPtr&& data, ptrdiff_t size, - at::Allocator *allocator) { - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - memset(storage, 0, sizeof(THCStorage)); - storage->scalar_type = scalar_type; - new (&storage->data_ptr) at::DataPtr(std::move(data)); - storage->size = size; - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; +THC_API THCStorage* THCStorage_new( + THCState* state, + at::ScalarType scalar_type) { + THStorage* storage = new THStorage( + scalar_type, + 0, + state->cudaDeviceAllocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); return storage; } diff --git a/aten/src/THC/THCStorage.h b/aten/src/THC/THCStorage.h index 22a607ce43107f..d14df7f50859ae 100644 --- a/aten/src/THC/THCStorage.h +++ b/aten/src/THC/THCStorage.h @@ -1,7 +1,7 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC -#include "THStorage.h" +#include "THStorageFunctions.h" #include "THCGeneral.h" #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index ae5ad7bd8cdc72..8ab7e27fec485a 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -4,7 +4,8 @@ // read Note [TH abstraction violation] #include "THCStorage.h" -#include +// Should work with THStorageClass +#include #include "ATen/ScalarType.h" #include "ATen/ScalarTypeUtils.h" @@ -17,19 +18,10 @@ struct CTypeToScalarType<__half> : public CTypeToScalarType {}; } -THC_API THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type); -THC_API THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size); - -THC_API THCStorage* THCStorage_newWithAllocator(THCState *state, - at::ScalarType scalar_type, - ptrdiff_t size, - at::Allocator* allocator); +THC_API THCStorage* THCStorage_new(THCState* state, at::ScalarType); THC_API void THCStorage_retain(THCState *state, THCStorage *storage); -// This exists to have a data-type independent way of freeing (necessary for THPPointer). 
-THC_API void THCStorage_free(THCState *state, THCStorage *self); - THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 7ecf02f014a342..13fdff6b3b566b 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -19,16 +19,16 @@ int THCTensor__nDimension(THCState *state, const THCTensor *self) { int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); - return self->size[dim]; + return self->size(dim); } int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); - return self->stride[dim]; + return self->stride(dim); } THLongStorage *THCTensor_newSizeOf(THCState *state, THCTensor *self) { THLongStorage *size = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(size, self->size); + THLongStorage_rawCopy(size, THTensor_getSizePtr(self)); return size; } @@ -73,7 +73,7 @@ void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { isSame = 1; for(d = 0; d < self->dim(); d++) { - if(self->size[d] != src->size[d]) + if(self->size(d) != src->size(d)) { isSame = 0; break; @@ -82,7 +82,7 @@ void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { } if(!isSame) - THCTensor_resizeNd(state, self, src->dim(), src->size, NULL); + THCTensor_resizeNd(state, self, src->dim(), THTensor_getSizePtr(src), NULL); } void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) @@ -108,12 +108,12 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ AT_CHECK(size[d] > 0, "sizes must be non-negative"); } #endif - if((self->dim() > d) && (size[d] != self->size[d])) { + if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } // NB: this used to test that stride[d] was >= 0 - if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + if((self->dim() > d) && stride && (stride[d] != self->stride(d))) { hascorrectsize = false; } } @@ -128,26 +128,24 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ if(nDimension != self->dim()) { - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); - self->dim_ = nDimension; + THTensor_resizeDim(self, nDimension); } totalSize = 1; for(d = nDimension-1; d >= 0; d--) { - self->size[d] = size[d]; + THTensor_setSizeAtDim(self, d, size[d]); if(stride && (stride[d] >= 0) ) { - self->stride[d] = stride[d]; + THTensor_setStrideAtDim(self, d, stride[d]); } else { if(d == nDimension-1) { - self->stride[d] = 1; + THTensor_setStrideAtDim(self, d, 1); } else { // Keep stride monotonically increasing to match NumPy. 
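+            // e.g. resizing to {2, 3, 4} gives strides {12, 4, 1}; the max(..., 1)
+            // keeps strides positive when some size is 0, so {2, 0, 4} still gives
+            // {4, 4, 1} instead of introducing a zero stride.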
- self->stride[d] = std::max(self->size[d+1],1)*self->stride[d+1]; + THTensor_setStrideAtDim(self, d, std::max(self->size(d+1),1)*self->stride(d+1)); } } - totalSize += (self->size[d]-1)*self->stride[d]; + totalSize += (self->size(d)-1)*self->stride(d); } if(totalSize+self->storageOffset > 0) @@ -169,8 +167,8 @@ void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) src->storage, src->storageOffset, src->dim(), - src->size, - src->stride); + THTensor_getSizePtr(src), + THTensor_getStridePtr(src)); } void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) @@ -182,7 +180,7 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag THError("Tensor: invalid null storage"); } auto scalar_type = self->storage->scalar_type; - THCStorage_free(state, self->storage); + THStorage_free(self->storage); if(storage) { @@ -214,17 +212,17 @@ void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int d THCTensor_set(state, self, src); #ifdef TH_SCALAR - if(src->size[dimension] == 1) + if(src->size(dimension) == 1) #else - if(src->size[dimension] == 1 && src->dim() > 1) + if(src->size(dimension) == 1 && src->dim() > 1) #endif { for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } } @@ -242,19 +240,17 @@ void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int THCTensor_set(state, self, src); - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); - self->dim_++; + THTensor_resizeDim(self, self->dim() + 1); for (d = self->dim()-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; + THTensor_setSizeAtDim(self, d, self->size(d-1)); + THTensor_setStrideAtDim(self, d, self->stride(d-1)); } if (dimension+1 < self->dim()) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + THTensor_setStrideAtDim(self, dimension, self->size(dimension+1) * self->stride(dimension+1)); } else { - self->stride[dimension] = 1; + THTensor_setStrideAtDim(self, dimension, 1); } - self->size[dimension] = 1; + THTensor_setSizeAtDim(self, dimension, 1); } bool THCTensor_isContiguous(THCState *state, const THCTensor *self) { @@ -263,10 +259,10 @@ bool THCTensor_isContiguous(THCState *state, const THCTensor *self) { int d; for(d = self->dim()-1; d >= 0; d--) { - if(self->size[d] != 1) + if(self->size(d) != 1) { - if(self->stride[d] == z) - z *= self->size[d]; + if(self->stride(d) == z) + z *= self->size(d); else return false; } @@ -292,33 +288,18 @@ ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self) { ptrdiff_t nElement = 1; int d; for(d = 0; d < self->_dim(); d++) - nElement *= self->size[d]; + nElement *= self->size(d); return nElement; } } void THCTensor_retain(THCState *state, THCTensor *self) { - if(self->flag & TH_TENSOR_REFCOUNTED) - self->refcount++; + self->refcount++; } void THCTensor_free(THCState *state, THCTensor *self) { - if(!self) - return; - - if(self->flag & TH_TENSOR_REFCOUNTED) - { - if(--self->refcount == 0) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THCStorage_free(state, 
self->storage); - self->refcount.~atomic(); - THFree(self); - } - } + THTensor_free(self); } int THCTensor_getDevice(THCState* state, const THCTensor* tensor) { diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index d4fa2c6835f46e..56147b27e912c4 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -10,54 +10,6 @@ #include #include -typedef struct THCTensor -{ - int64_t *size; - int64_t *stride; - int64_t dim_; - - THCStorage *storage; - ptrdiff_t storageOffset; - std::atomic refcount; - - char flag; - - template - inline T * data() const { - return storage->data() + storageOffset; - } - - template - inline T * unsafe_data() const { - return storage->unsafe_data() + storageOffset; - } - - // [NOTE: _dim() vs dim()] - // _dim() returns the "old" TH dimension view where no dimensions represents an empty tensor. - // dim() returns the ATen view of the dimensionality, i.e. 0-sized dimensions are supported. - inline int64_t _dim() const { - return is_empty() ? 0: dim_; - } - - inline int64_t dim() const { - return dim_; - } - - // represents that numel() == 0. - inline bool is_empty() const { - for (int64_t i = 0; i < dim_; ++i) { - if (size[i] == 0) { - return true; - } - } - return false; - } - - inline at::IntList sizes() { - return at::IntList(size, dim_); - } -} THCTensor; - // See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). THC_API int THCTensor_nDimension(THCState *state, const THCTensor *self); THC_API int THCTensor__nDimension(THCState *state, const THCTensor *self); diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh index 7749f231c5c771..fc3d7fb49fec81 100644 --- a/aten/src/THC/THCTensorRandom.cuh +++ b/aten/src/THC/THCTensorRandom.cuh @@ -160,8 +160,8 @@ sampleMultinomialOnce(int64_t* dest, int categories, T* sampled, T* dist, - int stride_dist, // dist->stride[0] - int stride_categories // dist->stride[1] + int stride_dist, // dist->stride(0) + int stride_categories // dist->stride(1) ) { extern __shared__ unsigned char my_smem[]; __shared__ bool found; diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index 98b4c3bebb40fb..91754e70f00328 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -40,19 +40,33 @@ real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) THCStorage* THCStorage_(new)(THCState *state) { - return THCStorage_new(state, at::CTypeToScalarType::to()); + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + 0, + state->cudaDeviceAllocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { - return THCStorage_newWithSize(state, at::CTypeToScalarType::to(), size); + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + size, + state->cudaDeviceAllocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, at::Allocator* allocator) { - return THCStorage_newWithAllocator(state, at::CTypeToScalarType::to(), - size, allocator); + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + size, + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) @@ -96,9 +110,17 @@ THCStorage* THCStorage_(newWithMapping)(THCState *state, const char 
*fileName, p } THCStorage* THCStorage_(newWithDataAndAllocator)( - THCState *state, at::DataPtr&& data, ptrdiff_t size, - at::Allocator *allocator) { - return THCStorage_newWithDataAndAllocator(state, at::CTypeToScalarType::to(), std::move(data), size, allocator); + THCState* state, + at::DataPtr&& data, + ptrdiff_t size, + at::Allocator* allocator) { + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + size, + std::move(data), + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag) @@ -118,6 +140,6 @@ void THCStorage_(retain)(THCState *state, THCStorage *self) void THCStorage_(free)(THCState *state, THCStorage *self) { - THCStorage_free(state, self); + THStorage_free(self); } #endif diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index d8ee2045bcc60e..3b03e37232ef05 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -41,7 +41,7 @@ THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) { THLongStorage *stride = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(stride, self->stride); + THLongStorage_rawCopy(stride, THTensor_getStridePtr(self)); return stride; } @@ -53,53 +53,36 @@ real *THCTensor_(data)(THCState *state, const THCTensor *self) return NULL; } -void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag) -{ - self->flag |= flag; -} - -void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag) -{ - self->flag &= ~flag; -} - /**** creation methods ****/ -static void THCTensor_(rawInit)(THCState *state, THCTensor *self); - - /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); - return self; + return new THCTensor(THCStorage_(new)(state)); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, tensor->storage, tensor->storageOffset, tensor->dim(), - tensor->size, - tensor->stride); + THTensor_getSizePtr(tensor), + THTensor_getStridePtr(tensor)); return self; } /* Storage init */ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) { - THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); if(size && stride) THArgCheck(size->size == stride->size, 4, "inconsistent size"); AT_CHECK(size, "size must not be null"); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, storage, @@ -113,8 +96,7 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd THCTensor *THCTensor_(newWithStorageIntLists)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); - THCTensor *self = (THCTensor *)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), 
const_cast(sizes.data()), const_cast(strides.data())); @@ -159,8 +141,7 @@ THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size, THLongS } THCTensor *THCTensor_(newWithSizeIntList)(THCState *state, at::IntList sizes) { - THCTensor *self = (THCTensor *)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(resizeNd)(state, self, sizes.size(), const_cast(sizes.data()), nullptr); return self; @@ -237,8 +218,8 @@ THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage ptrdiff_t numel = THCTensor_(nElement)(state, tensor); THCTensor *self = THCTensor_(new)(state); THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); - auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), - at::IntList(tensor->stride, tensor->dim()), + auto stride = THTensor_compute_stride(tensor->sizes(), + tensor->strides(), at::IntList(inferred_size->data(), inferred_size->size)); THArgCheck(stride.has_value(), 2, "view size is " "not compatible with input tensor's size and stride (at least one dimension spans " @@ -391,14 +372,14 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di #else THArgCheck( size > 0, 5, "out of range"); #endif - THArgCheck(firstIndex+size <= src->size[dimension], 5, "out of range"); + THArgCheck(firstIndex+size <= src->size(dimension), 5, "out of range"); THCTensor_(set)(state, self, src); if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride[dimension]; + self->storageOffset += firstIndex*self->stride(dimension); - self->size[dimension] = size; + THTensor_setSizeAtDim(self, dimension, size); } void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t sliceIndex) @@ -408,20 +389,24 @@ void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; -#ifndef USE_TH_SCALAR +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#else +#ifndef USE_TH_SCALAR + THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); - THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 4, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 4, "out of range"); THCTensor_(set)(state, self, src); THCTensor_(narrow)(state, self, NULL, dimension, sliceIndex, 1); for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1, int dimension2) @@ -431,26 +416,24 @@ void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->_dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->_dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THCTensor_(set)(state, self, src); if(dimension1 == dimension2) return; - z = self->stride[dimension1]; - self->stride[dimension1] = self->stride[dimension2]; - 
self->stride[dimension2] = z; - z = self->size[dimension1]; - self->size[dimension1] = self->size[dimension2]; - self->size[dimension2] = z; + z = self->stride(dimension1); + THTensor_setStrideAtDim(self, dimension1, self->stride(dimension2)); + THTensor_setStrideAtDim(self, dimension2, z); + z = self->size(dimension1); + THTensor_setSizeAtDim(self, dimension1, self->size(dimension2)); + THTensor_setSizeAtDim(self, dimension2, z); } void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t size, int64_t step) { - int64_t *newSize; - int64_t *newStride; int d; if(!src) @@ -460,36 +443,31 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); #endif THArgCheck(dimension < src->dim(), 2, "out of range"); - THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THCTensor_(set)(state, self, src); - newSize = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); - newStride = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); + std::vector newSize(self->dim() + 1); + std::vector newStride(self->dim() + 1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride[dimension]; + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { if(d == dimension) { - newSize[d] = (self->size[d] - size) / step + 1; - newStride[d] = step*self->stride[d]; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self->size[d]; - newStride[d] = self->stride[d]; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } - THFree(self->size); - THFree(self->stride); - - self->size = newSize; - self->stride = newStride; - self->dim_++; + THTensor_setSizesAndStrides(self, std::move(newSize), std::move(newStride)); } /* we have to handle the case where the result is a number */ @@ -505,12 +483,12 @@ void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) for(d = 0; d < src->dim(); d++) { - if(src->size[d] != 1) + if(src->size(d) != 1) { if(d != ndim) { - self->size[ndim] = src->size[d]; - self->stride[ndim] = src->stride[d]; + THTensor_setSizeAtDim(self, ndim, src->size(d)); + THTensor_setStrideAtDim(self, ndim, src->stride(d)); } ndim++; } @@ -520,11 +498,11 @@ void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) /* right now, we do not handle 0-dimension tensors */ if(ndim == 0 && src->dim() > 0) { - self->size[0] = 1; - self->stride[0] = 1; + THTensor_setSizeAtDim(self, 0, 1); + THTensor_setStrideAtDim(self, 0, 1); ndim = 1; } - self->dim_ = ndim; + THTensor_resizeDim(self, ndim); } #endif @@ -551,7 +529,7 @@ int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStora for (d = 0; d < self->dim(); ++d) { - if (self->size[d] != THLongStorage_data(dims)[d]) + if (self->size(d) != THLongStorage_data(dims)[d]) return 0; } return 1; @@ -566,7 +544,7 @@ int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor int d; for (d = 0; d < self->dim(); ++d) { - if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + if (self->size(d) != src->size(d) || self->stride(d) != src->stride(d)) return 0; } return 1; @@ -581,7 +559,7 @@ int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTe return 0; for(d = 0; d < self->dim(); ++d) { - if(self->size[d] != src->size[d]) 
+ if(self->size(d) != src->size(d)) return 0; } return 1; @@ -612,19 +590,6 @@ void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst) /*******************************************************************************/ -static void THCTensor_(rawInit)(THCState *state, THCTensor *self) -{ - new (&self->refcount) std::atomic(1); - self->storage = THCStorage_(new)(state); - self->storageOffset = 0; - self->size = static_cast(THAlloc(sizeof(int64_t))); - self->stride = static_cast(THAlloc(sizeof(int64_t))); - self->size[0] = 0; - self->stride[0] = 1; - self->dim_ = 1; - self->flag = TH_TENSOR_REFCOUNTED; -} - void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { THCTensor_setStorageNd(state, self, storage, storageOffset, nDimension, size, stride); @@ -638,57 +603,57 @@ void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int6 void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); } void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - THCStorage_(set)(state, 
tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) 
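Note on the accessor pattern in the hunks above: the raw int64_t* size/stride arrays are no longer read or written directly; reads go through size(d)/stride(d), and writes go through THTensor_setSizeAtDim, THTensor_setStrideAtDim, THTensor_resizeDim and THTensor_setSizesAndStrides. A minimal standalone sketch of the read side, using a toy struct rather than the real THCTensor, is:

    #include <cstdint>
    #include <vector>

    // Toy stand-in for a strided tensor; not the real THCTensor.
    struct ToyTensor {
      std::vector<int64_t> sizes_, strides_;
      int64_t dim() const { return static_cast<int64_t>(sizes_.size()); }
      int64_t size(int64_t d) const { return sizes_[d]; }
      int64_t stride(int64_t d) const { return strides_[d]; }
    };

    // Same check as THCTensor_isContiguous above, written against the accessors:
    // skipping size-1 dimensions, each stride must equal the product of the
    // sizes of the dimensions to its right.
    bool isContiguous(const ToyTensor& t) {
      int64_t z = 1;
      for (int64_t d = t.dim() - 1; d >= 0; d--) {
        if (t.size(d) != 1) {
          if (t.stride(d) != z) return false;
          z *= t.size(d);
        }
      }
      return true;
    }
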
@@ -747,7 +712,7 @@ THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor) { int i; for(i = 0; i < tensor->dim(); i++) { if(n >= L) break; - n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + n += snprintf(str+n, L-n, "%" PRId64, tensor->size(i)); if(i < tensor->dim()-1) { n += snprintf(str+n, L-n, " x "); } diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h index e3e3648c3b3a5a..8e9bf84727420e 100644 --- a/aten/src/THC/generic/THCTensor.h +++ b/aten/src/THC/generic/THCTensor.h @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/THCTensor.h" #else -#define TH_TENSOR_REFCOUNTED 1 - typedef struct THCTensor THCTensor; // These used to be distinct types; for some measure of backwards compatibility and documentation diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 0e6a7ffcc1d47b..f93ad4dfe14dd2 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -19,14 +19,14 @@ static ptrdiff_t THCTensor_(getSliceSize)(THCState *state, THCTensor *dst, ptrdiff_t dstSliceSize = 1; for (int d = 0; d < dstDims; d++) { if (d != dim) { - dstSliceSize *= dst->size[d]; + dstSliceSize *= dst->size(d); } } if (src == nullptr) return dstSliceSize; THArgCheck(dim < srcDims, 3, "Indexing dim is out of bounds"); - THArgCheck(THCudaLongTensor_nElement(state, index) == src->size[dim], 4, + THArgCheck(THCudaLongTensor_nElement(state, index) == src->size(dim), 4, "length of src.size[dim] is not equal to length of indices"); ptrdiff_t srcSliceSize = 1; @@ -36,8 +36,8 @@ static ptrdiff_t THCTensor_(getSliceSize)(THCState *state, THCTensor *dst, for (int d = 0; d < srcDims; d++) { if (d != dim) { - srcSliceSize *= src->size[d]; - if (!mismatch && dst->size[d] != src->size[d]) mismatch = true; + srcSliceSize *= src->size(d); + if (!mismatch && dst->size(d) != src->size(d)) mismatch = true; } } @@ -224,7 +224,7 @@ void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLon THArgCheck(!(THCTensor_(_nDimension)(state, src) == 0 && THCudaLongTensor__nDimension(state, index) != 0), 2, "tried to take from an empty tensor"); - THCTensor_(resizeNd)(state, dst, index->dim(), index->size, NULL); + THCTensor_(resizeNd)(state, dst, index->dim(), THTensor_getSizePtr(index), NULL); // dispatchTakePut only handles non-empty tensors; if (index->_dim() > 0) { diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 8bdd8fa68871e8..07033fa0e8f1d8 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -314,9 +314,9 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, strided_tensor.begin(), strided_tensor.end(), stride_dim.begin(), - idx_functor(div, self->size[dim]) + idx_functor(div, self->size(dim)) ); - div *= self->size[dim]; + div *= self->size(dim); } THCudaLongTensor_resize2d(state, tensor, num_nonzeros, num_dim); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 6d1da07d74b947..babbd6d24eb61d 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -49,16 +49,16 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); - if( (mat->_dim() != 2) || (vec->_dim() != 1) ) + if( (mat->dim() != 
2) || (vec->dim() != 1) ) THError("matrix and vector expected"); - if( mat->size[1] != vec->size[0] ) + if( mat->size(1) != vec->size(0) ) THError("size mismatch"); - if(t->_dim() != 1) + if(t->dim() != 1) THError("size mismatch"); - if(t->size[0] != mat->size[0]) + if(t->size(0) != mat->size(0)) THError("size mismatch"); #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) @@ -68,32 +68,32 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real THCTensor_(copy)(state, r_, t); } - if(mat->stride[0] == 1) + if(mat->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1], - alpha, THCTensor_(data)(state, mat), mat->stride[1], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sgemv(state, 'n', mat->size(0), mat->size(1), + alpha, THCTensor_(data)(state, mat), mat->stride(1), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1], - alpha, THCTensor_(data)(state, mat), mat->stride[1], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dgemv(state, 'n', mat->size(0), mat->size(1), + alpha, THCTensor_(data)(state, mat), mat->stride(1), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } - else if(mat->stride[1] == 1) + else if(mat->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, mat), mat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, mat), mat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, mat), mat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, mat), mat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } else @@ -101,32 +101,42 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real THCTensor *cmat = THCTensor_(newContiguous)(state, mat); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, cmat), cmat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, cmat), cmat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, cmat), cmat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, cmat), cmat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, 
THCTensor_(data)(state, r_), r_->stride(0)); #endif THCTensor_(free)(state, cmat); } + // cublasSgemv, cublasDgemv have a bug where (x,0).mv(0) does not + // handle beta, whereas cublasSgemm, cublasDgemm do for case where (x,0).mm(0,y). + if (vec->size(0) == 0 && mat->size(0) != 0) { + if(THCNumerics::eq(beta, ScalarConvert::to(0))) { + THCTensor_(zero)(state, r_); + } else if(THCNumerics::ne(beta, ScalarConvert::to(1))) { + THCTensor_(mul)(state, r_, r_, beta); + } + } + #elif defined(THC_REAL_IS_HALF) // Currently no Hgemv/SgemvEx in Cublas THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); - THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size[0], 1); + THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size(0), 1); THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); - THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size[0], 1); + THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size(0), 1); THCTensor_(addmm)(state, r_, beta, tAsMatrix, alpha, mat, vecAsMatrix); // r_ will have answer as matrix, need to return a vector - THCTensor_(resize1d)(state, r_, r_->size[0]); + THCTensor_(resize1d)(state, r_, r_->size(0)); THCTensor_(free)(state, vecAsMatrix); THCTensor_(free)(state, tAsMatrix); #endif @@ -140,15 +150,15 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); - if ( (vec1->_dim() != 1) || (vec2->_dim() != 1) ) { + if ( (vec1->dim() != 1) || (vec2->dim() != 1) ) { THError("vector and vector expected"); } - if (t->_dim() != 2) { + if (t->dim() != 2) { THError("size mismatch"); } - if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + if ( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THError("size mismatch"); } @@ -164,32 +174,32 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor_(mul)(state, r_, r_, beta); } - if(r_->stride[0] == 1) + if(r_->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec1->size[0], vec2->size[0], - alpha, THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, r_), r_->stride[1]); + THCudaBlas_Sger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, r_), r_->stride(1)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec1->size[0], vec2->size[0], - alpha, THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, r_), r_->stride[1]); + THCudaBlas_Dger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, r_), r_->stride(1)); #endif } - else if(r_->stride[1] == 1) + else if(r_->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - 
THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, r_), r_->stride(0)); #endif } else @@ -197,15 +207,15 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor *cr = THCTensor_(newClone)(state, r_); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, cr), cr->stride[0]); + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, cr), cr->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, cr), cr->stride[0]); + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, cr), cr->stride(0)); #endif THCTensor_(freeCopyTo)(state, cr, r_); @@ -213,11 +223,11 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a #elif defined(THC_REAL_IS_HALF) // currently no Hger/SgerEx in Cublas. THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); - THCTensor_(resize2d)(state, vec2T, vec2T->size[0], 1); + THCTensor_(resize2d)(state, vec2T, vec2T->size(0), 1); THCTensor_(transpose)(state, vec2T, NULL, 0, 1); THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); - THCTensor_(resize2d)(state, vec1M, vec1M->size[0], 1); + THCTensor_(resize2d)(state, vec1M, vec1M->size(0), 1); THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); THCTensor_(free)(state, vec2T); @@ -237,19 +247,19 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real char transpose_r, transpose_m1, transpose_m2; THCTensor *r__, *m1_, *m2_; - if( (m1->_dim() != 2) || (m2->_dim() != 2) ) - THError("matrices expected, got %dD, %dD tensors", m1->_dim(), m2->_dim()); + if( (m1->dim() != 2) || (m2->dim() != 2) ) + THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim()); - if(t->_dim() != 2) - THError("matrix expected, got %dD tensor for t", t->_dim()); + if(t->dim() != 2) + THError("matrix expected, got %dD tensor for t", t->dim()); - if(m1->size[1] != m2->size[0]) { + if(m1->size(1) != m2->size(0)) { THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); } - if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + if( (t->size(0) != m1->size(0)) || (t->size(1) != m2->size(1)) ) { THCDescBuff bt = THCTensor_(sizeDesc)(state, t); THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); @@ -265,14 +275,14 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real } /* r_ */ - if(r_->stride[0] == 1 && - r_->stride[1] != 0) + if(r_->stride(0) == 1 && + r_->stride(1) != 0) { transpose_r = 'n'; r__ = r_; } - else 
if(r_->stride[1] == 1 && - r_->stride[0] != 0) + else if(r_->stride(1) == 1 && + r_->stride(0) != 0) { THCTensor *swap = m2; m2 = m1; @@ -291,14 +301,14 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real } /* m1 */ - if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + if(m1->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m1->stride((transpose_r == 'n' ? 1 : 0)) != 0) { transpose_m1 = 'n'; m1_ = m1; } - else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + else if(m1->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m1->stride((transpose_r == 'n' ? 0 : 1)) != 0) { transpose_m1 = 't'; m1_ = m1; @@ -310,14 +320,14 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real } /* m2 */ - if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + if(m2->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m2->stride((transpose_r == 'n' ? 1 : 0)) != 0) { transpose_m2 = 'n'; m2_ = m2; } - else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + else if(m2->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m2->stride((transpose_r == 'n' ? 0 : 1)) != 0) { transpose_m2 = 't'; m2_ = m2; @@ -332,47 +342,47 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real THCudaBlas_Hgemm(state, transpose_m1, transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], + r__->size((transpose_r == 'n' ? 0 : 1)), + r__->size((transpose_r == 'n' ? 1 : 0)), + m1_->size((transpose_r == 'n' ? 1 : 0)), alpha, THCTensor_(data)(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))), THCTensor_(data)(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1))), beta, THCTensor_(data)(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); + r__->stride((transpose_r == 'n' ? 1 : 0))); #elif defined(THC_REAL_IS_FLOAT) THCudaBlas_Sgemm(state, transpose_m1, transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], + r__->size((transpose_r == 'n' ? 0 : 1)), + r__->size((transpose_r == 'n' ? 1 : 0)), + m1_->size((transpose_r == 'n' ? 1 : 0)), alpha, THCTensor_(data)(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))), THCTensor_(data)(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1))), beta, THCTensor_(data)(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); + r__->stride((transpose_r == 'n' ? 1 : 0))); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemm(state, transpose_m1, transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 
1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], + r__->size((transpose_r == 'n' ? 0 : 1)), + r__->size((transpose_r == 'n' ? 1 : 0)), + m1_->size((transpose_r == 'n' ? 1 : 0)), alpha, THCTensor_(data)(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))), THCTensor_(data)(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1))), beta, THCTensor_(data)(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); + r__->stride((transpose_r == 'n' ? 1 : 0))); #endif /* free intermediate variables */ @@ -397,9 +407,9 @@ THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); - THArgCheck(THCTensor_(_nDimension)(state, t) == 2, 4, "expected 2D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); int64_t batchnum = THCTensor_(size)(state, batch1, 0); int64_t m1d1 = THCTensor_(size)(state, batch1, 1); @@ -462,9 +472,9 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); - THArgCheck(THCTensor_(_nDimension)(state, t) == 3, 4, "expected 3D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6, "equal number of batches expected"); THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7, @@ -487,13 +497,13 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, char transpose_batch1, transpose_batch2; int64_t lda, ldb, ldc; THCTensor *result_, *batch1_, *batch2_; - if (result->stride[1] == 1) + if (result->stride(1) == 1) { transpose_result = false; result_ = result; - ldc = result_->stride[2]; + ldc = result_->stride(2); } - else if (result->stride[2] == 1) + else if (result->stride(2) == 1) { transpose_result = true; @@ -502,7 +512,7 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, batch1 = swap; result_ = result; - ldc = result_->stride[1]; + ldc = result_->stride(1); } else { @@ -513,22 +523,22 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real 
beta, THCTensor *t, THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, result_, NULL, 1, 2); - ldc = result_->stride[2]; + ldc = result_->stride(2); } - if (batch1->stride[transpose_result ? 2 : 1] == 1 && - batch1->stride[transpose_result ? 1 : 2] != 0) + if (batch1->stride(transpose_result ? 2 : 1) == 1 && + batch1->stride(transpose_result ? 1 : 2) != 0) { transpose_batch1 = 'n'; batch1_ = batch1; - lda = batch1_->stride[transpose_result ? 1 : 2]; + lda = batch1_->stride(transpose_result ? 1 : 2); } - else if (batch1->stride[transpose_result ? 1 : 2] == 1 && - batch1->stride[transpose_result ? 2 : 1] != 0) + else if (batch1->stride(transpose_result ? 1 : 2) == 1 && + batch1->stride(transpose_result ? 2 : 1) != 0) { transpose_batch1 = 't'; batch1_ = batch1; - lda = batch1_->stride[transpose_result ? 2 : 1]; + lda = batch1_->stride(transpose_result ? 2 : 1); } else { @@ -539,22 +549,22 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, } else { batch1_ = THCTensor_(newContiguous)(state, batch1); } - lda = batch1_->stride[1]; + lda = batch1_->stride(1); } - if (batch2->stride[transpose_result ? 2 : 1] == 1 && - batch2->stride[transpose_result ? 1 : 2] != 0) + if (batch2->stride(transpose_result ? 2 : 1) == 1 && + batch2->stride(transpose_result ? 1 : 2) != 0) { transpose_batch2 = 'n'; batch2_ = batch2; - ldb = batch2_->stride[transpose_result ? 1 : 2]; + ldb = batch2_->stride(transpose_result ? 1 : 2); } - else if (batch2->stride[transpose_result ? 1 : 2] == 1 && - batch2->stride[transpose_result ? 2 : 1] != 0) + else if (batch2->stride(transpose_result ? 1 : 2) == 1 && + batch2->stride(transpose_result ? 2 : 1) != 0) { transpose_batch2 = 't'; batch2_ = batch2; - ldb = batch2_->stride[transpose_result ? 2 : 1]; + ldb = batch2_->stride(transpose_result ? 2 : 1); } else { @@ -565,9 +575,9 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, } else { batch2_ = THCTensor_(newContiguous)(state, batch2); } - ldb = batch2_->stride[1]; + ldb = batch2_->stride(1); } - int64_t num_batches = result_->size[0]; + int64_t num_batches = result_->size(0); #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) // Compute pointers to matrices in each batch. @@ -585,16 +595,16 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, createBatchGemmBuffer3<<>>( d_matrices1, d_matrices2, (const real**)d_result_matrices, THCTensor_(data)(state, batch1_), THCTensor_(data)(state, batch2_), THCTensor_(data)(state, result_), - batch1_->stride[0], batch2_->stride[0], result_->stride[0], num_batches); + batch1_->stride(0), batch2_->stride(0), result_->stride(0), num_batches); #ifdef THC_REAL_IS_FLOAT THCudaBlas_SgemmBatched( state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, d_matrices1, lda, d_matrices2, ldb, @@ -606,9 +616,9 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 
1 : 2), alpha, d_matrices1, lda, d_matrices2, ldb, @@ -627,28 +637,28 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], - THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + THCTensor_(data)(state, batch1_), lda, batch1_->stride(0), + THCTensor_(data)(state, batch2_), ldb, batch2_->stride(0), beta, - THCTensor_(data)(state, result_), ldc, result_->stride[0], + THCTensor_(data)(state, result_), ldc, result_->stride(0), num_batches); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_DgemmStridedBatched( state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], - THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + THCTensor_(data)(state, batch1_), lda, batch1_->stride(0), + THCTensor_(data)(state, batch2_), ldb, batch2_->stride(0), beta, - THCTensor_(data)(state, result_), ldc, result_->stride[0], + THCTensor_(data)(state, result_), ldc, result_->stride(0), num_batches); #endif //THC_REAL #endif //CUDA_VERSION @@ -662,14 +672,14 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, - THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + THCTensor_(data)(state, batch1_) + i * batch1_->stride(0), lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride(0), ldb, beta, - THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + THCTensor_(data)(state, result_) + i * result_->stride(0), ldc); } #else cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); @@ -679,14 +689,14 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 
1 : 2), alpha, - THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], - THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + THCTensor_(data)(state, batch1_), lda, batch1_->stride(0), + THCTensor_(data)(state, batch2_), ldb, batch2_->stride(0), beta, - THCTensor_(data)(state, result_), ldc, result_->stride[0], + THCTensor_(data)(state, result_), ldc, result_->stride(0), num_batches); } else { for (int64_t i = 0; i < num_batches; ++i) { @@ -694,14 +704,14 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, - THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + THCTensor_(data)(state, batch1_) + i * batch1_->stride(0), lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride(0), ldb, beta, - THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + THCTensor_(data)(state, result_) + i * result_->stride(0), ldc); } } @@ -728,29 +738,26 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); - THArgCheck(THCTensor_(_nDimension)(state, a) == 3, 3, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, a) == 3, 3, "expected 3D tensor"); THArgCheck(THCTensor_(size)(state, a, 1) == THCTensor_(size)(state, a, 2), 3, "matrices must be square"); if (ra_ != a) { THCTensor_(resizeAs)(state, ra_, a); - // not sure if this is kosher, but things are nicer if we return in column major - if (ra_->stride[0] == 1) { - THCTensor_(transpose)(state, ra_, NULL, 1, 0); - } else if (ra_->stride[2] == 1) { + if (ra_->stride(2) == 1) { THCTensor_(transpose)(state, ra_, NULL, 1, 2); } THCTensor_(copy)(state, ra_, a); } - int n = a->size[1]; + int n = a->size(1); int lda; THCTensor *ra__; - if (ra_->stride[1] == 1) { + if (ra_->stride(1) == 1) { // column ordered, what BLAS wants - lda = ra_->stride[2]; + lda = ra_->stride(2); ra__ = ra_; } else { // not column ordered, need to make it such (requires copy) @@ -758,10 +765,10 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens ra__ = THCTensor_(newClone)(state, transp_r_); THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, ra__, NULL, 1, 2); - lda = ra__->stride[2]; + lda = ra__->stride(2); } - int64_t num_batches = ra__->size[0]; + int64_t num_batches = ra__->size(0); if (!pivot) { THCudaIntTensor *t = THCudaIntTensor_new(state); @@ -787,11 +794,13 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens size_t matrices_size = num_batches * sizeof(real*); auto d_result = static_cast(THCudaMalloc(state, matrices_size)); - const int64_t block = 512; - const int64_t grid = (num_batches + block - 1) / block; - createBatchGemmBuffer<<>>( - (const real**)d_result, THCTensor_(data)(state, ra__), - ra__->stride[0], num_batches); + if (num_batches > 0) { + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + createBatchGemmBuffer<<>>( + (const real**)d_result, THCTensor_(data)(state, ra__), + ra__->stride(0), num_batches); + } int *pivots_gpu = NULL; if (pivot) { @@ -810,12 +819,16 @@ 
THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens } if (free_rinfo_) { - int min = THCudaIntTensor_minall(state, rinfo_); - int max = THCudaIntTensor_maxall(state, rinfo_); - THCudaIntTensor_free(state, rinfo_); - if (min != 0 || max != 0) { - THError("failed to factorize some batch elements (min info == %d, max info == %d)", - min, max); + if(THCTensor_nElement(state, rinfo_) != 0) { + int min = THCudaIntTensor_minall(state, rinfo_); + int max = THCudaIntTensor_maxall(state, rinfo_); + THCudaIntTensor_free(state, rinfo_); + if (min != 0 || max != 0) { + THError("failed to factorize some batch elements (min info == %d, max info == %d)", + min, max); + } + } else { + THCudaIntTensor_free(state, rinfo_); } } @@ -846,16 +859,16 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b } - int n = atf->size[1]; - int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + int n = atf->size(1); + int nrhs = rb_->_dim() > 2 ? rb_->size(2) : 1; THCTensor *atf_; THCTensor *rb__; int lda, ldb; // correct ordering of A_tf - if (atf->stride[1] == 1) { + if (atf->stride(1) == 1) { // column ordered, what BLAS wants - lda = atf->stride[2]; + lda = atf->stride(2); atf_ = atf; } else { // not column ordered, need to make it such (requires copy) @@ -866,16 +879,16 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b atf_ = THCTensor_(newClone)(state, transp_r_); THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, atf_, NULL, 1, 2); - lda = atf_->stride[2]; + lda = atf_->stride(2); } // correct ordering of B - if (rb_->stride[1] == 1) { + if (rb_->stride(1) == 1) { // column ordered - if (rb_->_dim() == 2 || rb_->size[2] == 1) { + if (rb_->_dim() == 2 || rb_->size(2) == 1) { ldb = n; } else { - ldb = rb_->stride[2]; + ldb = rb_->stride(2); } rb__ = rb_; } else { @@ -885,14 +898,14 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b rb__ = THCTensor_(newClone)(state, transp_r_); THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, rb__, NULL, 1, 2); - ldb = rb__->stride[2]; + ldb = rb__->stride(2); } else { rb__ = THCTensor_(newClone)(state, rb_); ldb = n; } } - int64_t num_batches = rb_->size[0]; + int64_t num_batches = rb_->size(0); size_t matrices_size = num_batches * sizeof(real*); // Copy pointers to device. 
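The btrifact hunk above adds two empty-batch guards: the createBatchGemmBuffer launch now runs only when num_batches > 0, and the factorization info tensor is reduced with minall/maxall only when it has elements, since a grid of zero blocks is not a valid launch configuration and a min/max reduction over an empty tensor has nothing to reduce. A host-side sketch of the same guards, with hypothetical helper names rather than the THC API:

    #include <algorithm>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Mirrors the new rinfo_ handling: only look for nonzero LAPACK-style info
    // codes when there is at least one batch element to inspect.
    void checkBatchFactorizeInfo(const std::vector<int>& info) {
      if (info.empty()) return;
      int lo = *std::min_element(info.begin(), info.end());
      int hi = *std::max_element(info.begin(), info.end());
      if (lo != 0 || hi != 0)
        throw std::runtime_error("failed to factorize some batch elements");
    }

    // Mirrors the num_batches > 0 guard: with zero batches there is nothing to
    // set up, and (num_batches + block - 1) / block would yield a zero-block grid.
    int64_t bufferGridSize(int64_t num_batches, int64_t block = 512) {
      return num_batches > 0 ? (num_batches + block - 1) / block : 0;
    }
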
@@ -903,10 +916,10 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b const int64_t grid = (num_batches + block - 1) / block; createBatchGemmBuffer<<>>( (const real**)d_result, THCTensor_(data)(state, rb__), - rb__->stride[0], num_batches); + rb__->stride(0), num_batches); createBatchGemmBuffer<<>>( d_atf, THCTensor_(data)(state, atf_), - atf_->stride[0], num_batches); + atf_->stride(0), num_batches); if (!THCudaIntTensor_isContiguous(state, pivots)) { THError("Error: pivots is not contiguous."); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index fa7220729f0cf6..0de79233c122b4 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -40,7 +40,7 @@ static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) { THAssert(src->_dim() == 2); - if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0]) + if (self == src && self->stride(0) == 1 && self->stride(1) == self->size(0)) { THCTensor_(retain)(state, self); return self; @@ -51,8 +51,8 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T else THCTensor_(retain)(state, self); - int64_t size[2] = { src->size[0], src->size[1] }; - int64_t stride[2] = { 1, src->size[0] }; + int64_t size[2] = { src->size(0), src->size(1) }; + int64_t stride[2] = { 1, src->size(0) }; THCTensor_(resizeNd)(state, self, 2, size, stride); THCTensor_(copy)(state, self, src); @@ -65,11 +65,11 @@ THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); - THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square"); + THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible"); - int64_t n = a_->size[0]; - int64_t nrhs = b_->size[1]; + int64_t n = a_->size(0); + int64_t nrhs = b_->size(1); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); @@ -104,8 +104,8 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); - THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square"); + THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible"); magma_side_t sz = MagmaLeft; magma_uplo_t ul = uplo[0] == 'U' ? 
MagmaUpper : MagmaLower; @@ -114,8 +114,8 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, real alpha = 1; - int64_t n = a_->size[0]; - int64_t nrhs = b_->size[1]; + int64_t n = a_->size(0); + int64_t nrhs = b_->size(1); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); @@ -140,9 +140,9 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == b_->size[0], 2, "Expected A and b to have same size " + THArgCheck(a_->size(0) == b_->size(0), 2, "Expected A and b to have same size " "at dim 0, but they have incompatible sizes"); - THArgCheck(a_->size[0] >= a_->size[1], 2, "Expected A with shape (m x n) to have " + THArgCheck(a_->size(0) >= a_->size(1), 2, "Expected A with shape (m x n) to have " "m >= n. The case for m < n is not implemented yet."); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); @@ -150,9 +150,9 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T real *a_data = THCTensor_(data)(state, a); real *b_data = THCTensor_(data)(state, b); - int64_t m = a->size[0]; - int64_t n = a->size[1]; - int64_t nrhs = b->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); + int64_t nrhs = b->size(1); real wkopt; int info; @@ -185,7 +185,7 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) { #ifdef USE_MAGMA - int64_t n = a->size[0]; + int64_t n = a->size(0); int64_t lda = n; magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower; @@ -244,10 +244,10 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 3, "A should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square"); + THArgCheck(a_->size(0) == a_->size(1), 3, "A should be square"); magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; - int64_t n = a_->size[0]; + int64_t n = a_->size(0); real *a_data = th_magma_malloc_pinned(n * n); THCTensor_(copyTensor2d)(state, a_data, a_); @@ -328,8 +328,8 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, magma_vec_t jobz = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec; int iunused[1]; - int64_t m = a->size[0]; - int64_t n = a->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); int64_t k = m < n ? m : n; int64_t j = (jobz == MagmaAllVec) ? m : k; int64_t jv = (jobz == MagmaAllVec) ? 
n : k; @@ -387,11 +387,11 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) { THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); #ifdef USE_MAGMA int info; - int64_t n = a->size[0]; + int64_t n = a->size(0); int lwork = n * magma_get_sgetri_nb(n); THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); @@ -430,11 +430,11 @@ THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) magma_free_pinned(ipiv); THCTensor_(freeCopyTo)(state, input, ra_); #else - int64_t n = a->size[0]; + int64_t n = a->size(0); // input THCTensor *input = THCTensor_(newColumnMajor)(state, a, a); - THCTensor_(resizeNd)(state, ra_, 2, input->size, input->stride); + THCTensor_(resizeNd)(state, ra_, 2, THTensor_getSizePtr(input), THTensor_getStridePtr(input)); real *matrices1[1] = { THCTensor_(data)(state, input) }; real *matrices2[1] = { THCTensor_(data)(state, ra_) }; @@ -516,9 +516,9 @@ THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, co { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - int64_t n = a->size[0]; + int64_t n = a->size(0); magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); @@ -556,9 +556,9 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - int64_t n = a->size[0]; + int64_t n = a->size(0); magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); @@ -591,10 +591,10 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - int64_t n = a->size[0]; - int64_t nrhs = b->size[1]; + int64_t n = a->size(0); + int64_t nrhs = b->size(1); magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b); @@ -626,8 +626,8 @@ THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_ THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); - int64_t m = a->size[0]; - int64_t n = a->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); int64_t k = (m < n ? 
m : n); #if defined(THC_REAL_IS_FLOAT) @@ -663,8 +663,8 @@ THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THC THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_); - int64_t m = a->size[0]; - int64_t n = a->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); int64_t k = (m < n ? m : n); #if defined(THC_REAL_IS_FLOAT) diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index e0f1219dcf8433..62c57a04380c43 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -196,8 +196,8 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ if (self_ != src_) THCTensor_(resizeAs)(state, self_, src_); - int64_t stride0 = self_->stride[0]; - int64_t stride1 = self_->stride[1]; + int64_t stride0 = self_->stride(0); + int64_t stride1 = self_->stride(1); real *start = THCTensor_(data)(state, self_); TensorTriOp op(start, stride0, stride1, k); @@ -225,8 +225,8 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ if (self_ != src_) THCTensor_(resizeAs)(state, self_, src_); - int64_t stride0 = self_->stride[0]; - int64_t stride1 = self_->stride[1]; + int64_t stride0 = self_->stride(0); + int64_t stride1 = self_->stride(1); real *start = THCTensor_(data)(state, self_); TensorTriOp op(start, stride0, stride1, k); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 50ca326dba376d..7fb6fda38b38a5 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -114,9 +114,9 @@ THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); int i; - int nd = THCTensor_(_nDimension)(state, x); + int nd = THCTensor_(nDimension)(state, x); ptrdiff_t nelem = THCTensor_(nElement)(state, x); - THArgCheck(nd == THCTensor_(_nDimension)(state, y), 1, "tensors must have same number of dimensions"); + THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions"); for (i = 0; i < nd; i++) { THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i); if (dimension < 0 && THCTensor_(size)(state, x, i) == 3) { diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index e5d8e22e5bb5eb..1c9d9eac6ac603 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -61,13 +61,13 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, THCTensor *self_; THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0); THCTensor *data = THCTensor_(newClone)(state, src_); - ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0]; + ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size(0); THArgCheck(dimension >= 0 && dimension < THCTensor_(_nDimension)(state, src), 3, "invalid dimension"); THArgCheck(THCNumerics::gt(value, scalar_cast(0)), 2, "non-positive-norm not supported"); THArgCheck(THCTensor_(_nDimension)(state, src) > 1, 1, "need at least 2 dimensions"); - dim3 grid(data->size[0]); + dim3 grid(data->size(0)); dim3 threads(32); THCTensor_kernel_renorm diff --git 
a/aten/src/THC/generic/THCTensorMathScan.cu b/aten/src/THC/generic/THCTensorMathScan.cu index 63657d4aa6027a..5aafb3bae8a6d9 100644 --- a/aten/src/THC/generic/THCTensorMathScan.cu +++ b/aten/src/THC/generic/THCTensorMathScan.cu @@ -79,7 +79,7 @@ void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, real init, BinaryFunction binary_op) { // "init" must be the identity element for binary_op - int ndim = THCTensor_(_nDimension)(state, src); + int ndim = THCTensor_(nDimension)(state, src); THArgCheck(dimension >= 0 && dimension < ndim, 3, "dimension %d out of range", dimension + TH_INDEX_BASE); @@ -87,16 +87,18 @@ void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src, THCTensor *self = THCTensor_(newContiguous)(state, self_); src = THCTensor_(newContiguous)(state, src); -#ifndef THC_REAL_IS_HALF - if (ndim == 1) { - // thrust does not take an "init" - THCTensor_(scanThrust)(state, self, src, binary_op); - } else -#endif - if (dimension == ndim - 1) { - THCTensor_(scanInnermostDim)(state, self, src, init, binary_op); - } else { - THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op); + if (!self->is_empty()) { + #ifndef THC_REAL_IS_HALF + if (ndim == 1) { + // thrust does not take an "init" + THCTensor_(scanThrust)(state, self, src, binary_op); + } else + #endif + if (dimension == ndim - 1) { + THCTensor_(scanInnermostDim)(state, self, src, init, binary_op); + } else { + THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op); + } } THCTensor_(free)(state, src); diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index 1eb3b820079b7b..ccf5da4da7f25a 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -21,11 +21,11 @@ static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { - size[i] = t->size[i]; + size[i] = t->size(i); } else if (i < Dim) { size[i] = 1; } else { - size[Dim - 1] *= t->size[i]; + size[Dim - 1] *= t->size(i); } } return THCDeviceTensor(t->data(), size); diff --git a/aten/src/THCUNN/generic/Col2Im.cu b/aten/src/THCUNN/generic/Col2Im.cu index 03c8dfdd838115..d29dcf75814fd0 100644 --- a/aten/src/THCUNN/generic/Col2Im.cu +++ b/aten/src/THCUNN/generic/Col2Im.cu @@ -6,9 +6,9 @@ static inline void THNN_(Col2Im_shapeCheck)( THCState *state, THCTensor *input, THCTensor *gradOutput, - int outputHeight, int outputWidth, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 6, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -17,12 +17,12 @@ static inline void THNN_(Col2Im_shapeCheck)( THArgCheck(dW > 0 && dH > 0, 8, "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); - int ndim = THCTensor_(nDimension)(state, input); + int64_t ndim = THCTensor_(nDimension)(state, input); THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 2, input, "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); int batch_dim = (ndim == 3) ? 
0 : -1; - int64_t nInputPlane = input->size[batch_dim + 1]; + int64_t nInputPlane = input->size(batch_dim + 1); if (nInputPlane % (kW * kH) != 0) { THError("Expected size of input's dimension 1 to be divisible by the " @@ -30,7 +30,7 @@ static inline void THNN_(Col2Im_shapeCheck)( "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); } - int64_t inputLength = input->size[batch_dim + 2]; + int64_t inputLength = input->size(batch_dim + 2); int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; @@ -54,11 +54,11 @@ void THNN_(Col2Im_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THCUNN_assertSameGPU(state, 2, input, output); @@ -69,11 +69,11 @@ void THNN_(Col2Im_updateOutput)( if (input->dim() == 2) { // Force batch batched_input = false; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); } - int64_t batchSize = input->size[0]; - int64_t nInputPlane = input->size[1]; + int64_t batchSize = input->size(0); + int64_t nInputPlane = input->size(1); int64_t nOutputPlane = nInputPlane / (kW * kH); input = THCTensor_(newContiguous)(state, input); @@ -84,10 +84,10 @@ void THNN_(Col2Im_updateOutput)( THCTensor *input_n = THCTensor_(new)(state); THCTensor *output_n = THCTensor_(new)(state); - int height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < batchSize; elt++) { THCTensor_(select)(state, input_n, input, 0, elt); THCTensor_(select)(state, output_n, output, 0, elt); @@ -116,10 +116,10 @@ void THNN_(Col2Im_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, kH, kW, dH, dW, padH, padW, sH, sW); diff --git a/aten/src/THCUNN/generic/Im2Col.cu b/aten/src/THCUNN/generic/Im2Col.cu index dd6a6dcd01019f..d0f98f0b17cf5f 100644 --- a/aten/src/THCUNN/generic/Im2Col.cu +++ b/aten/src/THCUNN/generic/Im2Col.cu @@ -6,8 +6,8 @@ static inline void THNN_(Im2Col_shapeCheck)( THCState *state, THCTensor *input, THCTensor *gradOutput, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 4, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -18,7 +18,7 @@ static inline void THNN_(Im2Col_shapeCheck)( THArgCheck(sW > 0 && sH > 0, 10, "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); - int ndim = THCTensor_(nDimension)(state, input); + int64_t ndim = THCTensor_(nDimension)(state, input); THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, 
input, "Expected non-empty 3D or 4D input tensor, but got input of shape %s"); @@ -26,11 +26,11 @@ static inline void THNN_(Im2Col_shapeCheck)( if (ndim == 3) { dim_batch = -1; } - int nInputPlane = THCTensor_(size)(state, input, dim_batch + 1); - int inputHeight = THCTensor_(size)(state, input, dim_batch + 2); - int inputWidth = THCTensor_(size)(state, input, dim_batch + 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nInputPlane = THCTensor_(size)(state, input, dim_batch + 1); + int64_t inputHeight = THCTensor_(size)(state, input, dim_batch + 2); + int64_t inputWidth = THCTensor_(size)(state, input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; if (outputHeight < 1 || outputWidth < 1) { THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " @@ -46,10 +46,10 @@ void THNN_(Im2Col_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THCUNN_assertSameGPU(state, 2, input, output); @@ -59,18 +59,18 @@ void THNN_(Im2Col_updateOutput)( bool batched_input = true; if (input->dim() == 3) { batched_input = false; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int batchSize = THCTensor_(size)(state, input, 0); - int nInputPlane = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); + int64_t batchSize = THCTensor_(size)(state, input, 0); + int64_t nInputPlane = THCTensor_(size)(state, input, 1); + int64_t inputHeight = THCTensor_(size)(state, input, 2); + int64_t inputWidth = THCTensor_(size)(state, input, 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - int nOutputPlane = nInputPlane * kW * kH; - int outputLength = outputHeight * outputWidth; + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; THCTensor_(resize3d)(state, output, batchSize, nOutputPlane, outputLength); THCTensor_(zero)(state, output); @@ -78,7 +78,7 @@ void THNN_(Im2Col_updateOutput)( THCTensor *input_n = THCTensor_(new)(state); THCTensor *output_n = THCTensor_(new)(state); - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < batchSize; elt++) { THCTensor_(select)(state, input_n, input, 0, elt); THCTensor_(select)(state, output_n, output, 0, elt); @@ -104,11 +104,11 @@ void THNN_(Im2Col_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, inputHeight, inputWidth, 
diff --git a/aten/src/THCUNN/generic/IndexLinear.cu b/aten/src/THCUNN/generic/IndexLinear.cu index 244d2346887328..ea9683d4535390 100644 --- a/aten/src/THCUNN/generic/IndexLinear.cu +++ b/aten/src/THCUNN/generic/IndexLinear.cu @@ -41,12 +41,12 @@ void THNN_(IndexLinear_updateOutput)( THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); - int64_t batchSize = sizes->size[0]; - int64_t outDim = bias->size[0]; - int64_t wDim = weight->size[1]; - int64_t weightStride = weight->stride[0]; + int64_t batchSize = sizes->size(0); + int64_t outDim = bias->size(0); + int64_t wDim = weight->size(1); + int64_t weightStride = weight->stride(0); int maxNormalize = wDim - outDim; - int64_t keysSize = keys->size[0]; + int64_t keysSize = keys->size(0); int64_t nnzPerRow = divup(keysSize, batchSize); THCTensor_(resize2d)(state, output, batchSize, outDim); @@ -100,10 +100,10 @@ void THNN_(IndexLinear_accGradParameters)( accreal weightDecay, accreal scale) { - int64_t keysSize = keys->size[0]; - int64_t batchSize = sizes->size[0]; - int64_t outDim = bias->size[0]; - int64_t wDim = weight->size[1]; + int64_t keysSize = keys->size(0); + int64_t batchSize = sizes->size(0); + int64_t outDim = bias->size(0); + int64_t wDim = weight->size(1); int maxNormalize = wDim - outDim; // Make sure these inputs are contiguous to accelerate computations @@ -137,7 +137,7 @@ void THNN_(IndexLinear_accGradParameters)( real *gradOutputData = THCTensor_(data) (state, gradOutput); real *gradBiasData = THCTensor_(data) (state, gradBias); real *gradWeightData = THCTensor_(data) (state, gradWeight); - int64_t gradWeightStride = gradWeight->stride[0]; + int64_t gradWeightStride = gradWeight->stride(0); cudaStream_t stream = THCState_getCurrentStream(state); dim3 threads(THREADS_X, THREADS_Y); @@ -182,10 +182,10 @@ void THNN_(IndexLinear_accUpdateGradParameters)( THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); - int64_t batchSize = sizes->size[0]; - int64_t outDim = bias->size[0]; - int64_t keysSize = keys->size[0]; - int64_t wDim = weight->size[1]; + int64_t batchSize = sizes->size(0); + int64_t outDim = bias->size(0); + int64_t keysSize = keys->size(0); + int64_t wDim = weight->size(1); int maxNormalize = wDim - outDim; real *biasData = THCTensor_(data) (state, bias); @@ -194,7 +194,7 @@ void THNN_(IndexLinear_accUpdateGradParameters)( real *valuesData = THCTensor_(data) (state, values); int64_t *keysData = THCudaLongTensor_data (state, keys); int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); - int64_t weightStride = weight->stride[0]; + int64_t weightStride = weight->stride(0); cudaStream_t stream = THCState_getCurrentStream(state); dim3 threads(THREADS_X, THREADS_Y); @@ -241,15 +241,15 @@ void THNN_(IndexLinear_updateParameters)( THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 6, "cumSumSizes vector must be contiguous"); - int64_t outDim = bias->size[0]; - int64_t wDim = weight->size[1]; + int64_t outDim = bias->size(0); + int64_t wDim = weight->size(1); int maxNormalize = wDim - outDim; - int64_t keysSize = runningKeys->size[0]; - int64_t batchSize = cumSumSizes->size[0]; + int64_t keysSize = runningKeys->size(0); + int64_t batchSize = cumSumSizes->size(0); THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias); - int64_t gradWeightStride = gradWeight->stride[0]; - int64_t weightStride = weight->stride[0]; + int64_t gradWeightStride = gradWeight->stride(0); + 
int64_t weightStride = weight->stride(0); int64_t *keysData = THCudaLongTensor_data (state, runningKeys); int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); diff --git a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu index 2b02bf2093ff93..510a8230d74798 100644 --- a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -18,8 +18,8 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( if(input->dim() == 1) { - int dim = input->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + int dim = input->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), 3, "inconsistent target size"); THCTensor_(resize1d)(state, output, 1); @@ -39,17 +39,17 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } else if(input->dim() == 2) { - int nframe = input->size[0]; - int dim = input->size[1]; - THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) - && (target->size[1] == dim), 3, "inconsistent target size"); + int nframe = input->size(0); + int dim = input->size(1); + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + && (target->size(1) == dim), 3, "inconsistent target size"); - dim3 blocks(input->size[0]); + dim3 blocks(input->size(0)); dim3 threads(MULTILABELMARGIN_THREADS); if (reduction != Reduction::None) { - THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]); + THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size(0)); THCTensor_(resize1d)(state, output, 1); cunn_MultiLabelMarginCriterion_updateOutput_kernel @@ -67,7 +67,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } else { - THCTensor_(resize1d)(state, output, input->size[0]); + THCTensor_(resize1d)(state, output, input->size(0)); cunn_MultiLabelMarginCriterion_updateOutput_kernel <<>>( @@ -106,10 +106,10 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( if(gradInput->dim() == 1) { - int dim = gradInput->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + int dim = gradInput->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), 3, "inconsistent target size"); - THArgCheck(!istarget->is_empty() && (istarget->dim() == 1) && (istarget->size[0] == dim), 3, + THArgCheck(!istarget->is_empty() && (istarget->dim() == 1) && (istarget->size(0) == dim), 3, "inconsistent isTarget size"); dim3 blocks(1); dim3 threads(MULTILABELMARGIN_THREADS); @@ -121,20 +121,20 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), THCTensor_(data)(state, istarget), - 1, gradInput->size[0], + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, reduction != Reduction::None); } else if(gradInput->dim() == 2) { - int nframe = gradInput->size[0]; - int dim = gradInput->size[1]; - THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) - && (target->size[1] == dim), 3, "inconsistent target size"); - THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size[0] == nframe) - && (istarget->size[1] == dim), 3, "inconsistent isTarget size"); - dim3 blocks(gradInput->size[0]); + int nframe = gradInput->size(0); + int dim = gradInput->size(1); + THArgCheck(!target->is_empty() && (target->dim() == 2) && 
(target->size(0) == nframe) + && (target->size(1) == dim), 3, "inconsistent target size"); + THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size(0) == nframe) + && (istarget->size(1) == dim), 3, "inconsistent isTarget size"); + dim3 blocks(gradInput->size(0)); dim3 threads(MULTILABELMARGIN_THREADS); cunn_MultiLabelMarginCriterion_updateGradInput_kernel @@ -144,7 +144,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), THCTensor_(data)(state, istarget), - gradInput->size[0], gradInput->size[1], + gradInput->size(0), gradInput->size(1), reduction == Reduction::ElementwiseMean, reduction != Reduction::None); } diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index a620c0f8dad13f..8272b3d4020ec7 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size[0], + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size[0], + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -51,15 +51,15 @@ void THNN_(MultiMarginCriterion_updateOutput)( } else if (input->dim() == 2) { - int nframe = input->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + int nframe = input->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); - dim3 blocks(input->size[0]); + dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); if (reduction == Reduction::None) { - THCTensor_(resize1d)(state, output, input->size[0]); + THCTensor_(resize1d)(state, output, input->size(0)); if (p == 1) { cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( @@ -67,7 +67,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size[1], + nframe, input->size(1), false, margin ); @@ -79,7 +79,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size[1], + nframe, input->size(1), false, margin ); @@ -89,7 +89,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else { THCTensor_(resize1d)(state, output, 1); - THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]); // tmp output buffer + THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size(0)); // tmp output buffer if (p == 1) { cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( @@ -97,7 +97,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - nframe, input->size[1], + nframe, input->size(1), reduction == Reduction::ElementwiseMean, margin ); @@ -109,7 +109,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - input->size[0], input->size[1], + input->size(0), input->size(1), reduction == Reduction::ElementwiseMean, margin ); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size[0], + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size[0], + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -186,10 +186,10 @@ void THNN_(MultiMarginCriterion_updateGradInput)( } else if (input->dim() == 2) { - int nframe = gradInput->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + int nframe = gradInput->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); - dim3 blocks(gradInput->size[0]); + dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); if (p == 1) @@ -200,7 +200,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, gradInput->size[1], + nframe, gradInput->size(1), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -214,7 +214,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, gradInput->size[1], + nframe, gradInput->size(1), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu index e03d5739eb74a6..565ffcccd9ec2d 100644 --- a/aten/src/THCUNN/generic/PReLU.cu +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -24,12 +24,12 @@ void THNN_(PReLU_updateOutput)( input = THCTensor_(newContiguous)(state, input); int n = THCTensor_(nElement)(state, input); - if (input->size[ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { - mapSize *= input->size[d]; + mapSize *= input->size(d); } int nElemsPerSample = nOutputPlane * mapSize; preluForward<<>>( @@ -69,12 +69,12 @@ void THNN_(PReLU_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); int n = THCTensor_(nElement)(state, input); - if (input->size[ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { - mapSize *= input->size[d]; + mapSize *= input->size(d); } int nElemsPerSample = nOutputPlane * mapSize; preluBackward<<>>( @@ -142,10 +142,10 @@ void THNN_(PReLU_accGradParameters)( THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput); int64_t size3 = 1; for (int d = 2; d < ndim; d++) { - size3 *= input->size[d]; + size3 *= input->size(d); } - THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, size3); - THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane); + THCTensor_(resize3d)(state, buffer, input->size(0), nOutputPlane, size3); + THCTensor_(resize2d)(state, sumbuf, input->size(0), nOutputPlane); THCTensor_(sum)(state, sumbuf, buffer, 2, 1); THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0, 1); THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index d5270d6689f1fe..a370fffa527c4e 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && t->_dim() == 2 && t->size[1] == 3; + return !t->is_empty() && t->_dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && t->_dim() == 2 && t->size[0] == size0 && t->size[1] == size1; + return !t->is_empty() && t->_dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && t->_dim() == 1 && t->size[0] == size0; + return !t->is_empty() && t->_dim() == 1 && t->size(0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu index 05a7b04e082b99..b25bbb94e4ea5f 100644 --- a/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu @@ -22,13 +22,13 @@ void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t istrideD = input->stride[0]; - int64_t istrideH = input->stride[1]; - int64_t istrideW = input->stride[2]; + int64_t istrideD = input->stride(0); + int64_t istrideH = input->stride(1); + int64_t istrideW = input->stride(2); input_data = THCTensor_(data)(state, input); @@ -49,14 +49,14 @@ void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t istrideD = input->stride[1]; - int64_t istrideH = input->stride[2]; - int64_t istrideW = input->stride[3]; + int64_t istrideD = input->stride(1); + int64_t istrideH = input->stride(2); + int64_t istrideW = input->stride(3); input_data = THCTensor_(data)(state, 
input); @@ -95,12 +95,12 @@ void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t osizeH = gradOutput->size[1]; - int64_t osizeW = gradOutput->size[2]; + int64_t osizeH = gradOutput->size(1); + int64_t osizeW = gradOutput->size(2); //bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0); @@ -129,13 +129,13 @@ void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( } THCudaCheck(cudaGetLastError()); } else { - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t osizeH = gradOutput->size[2]; - int64_t osizeW = gradOutput->size[3]; + int64_t osizeH = gradOutput->size(2); + int64_t osizeW = gradOutput->size(3); //bool atomic = //(isizeW%osizeW != 0) || (isizeH%osizeH != 0); diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu index 3e5fab6dd95c14..6ca5c9b42b827d 100644 --- a/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu @@ -24,13 +24,13 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t istrideD = input->stride[0]; - int64_t istrideH = input->stride[1]; - int64_t istrideW = input->stride[2]; + int64_t istrideD = input->stride(0); + int64_t istrideH = input->stride(1); + int64_t istrideW = input->stride(2); input_data = THCTensor_(data)(state, input); @@ -55,14 +55,14 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t istrideD = input->stride[1]; - int64_t istrideH = input->stride[2]; - int64_t istrideW = input->stride[3]; + int64_t istrideD = input->stride(1); + int64_t istrideH = input->stride(2); + int64_t istrideW = input->stride(3); input_data = THCTensor_(data)(state, input); @@ -107,12 +107,12 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t osizeH = gradOutput->size[1]; - int64_t osizeW = gradOutput->size[2]; + int64_t osizeH = gradOutput->size(1); + int64_t osizeW = gradOutput->size(2); //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); @@ -145,13 +145,13 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( } 
THCudaCheck(cudaGetLastError()); } else { - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t osizeH = gradOutput->size[2]; - int64_t osizeW = gradOutput->size[3]; + int64_t osizeH = gradOutput->size(2); + int64_t osizeW = gradOutput->size(3); //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); diff --git a/aten/src/THCUNN/generic/SpatialAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAveragePooling.cu index 7b3d2d4ee4cca6..7811acc4247666 100644 --- a/aten/src/THCUNN/generic/SpatialAveragePooling.cu +++ b/aten/src/THCUNN/generic/SpatialAveragePooling.cu @@ -32,9 +32,9 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t nInputRows = input->size[dimh]; - int64_t nInputCols = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t nInputRows = input->size(dimh); + int64_t nInputCols = input->size(dimw); int64_t nOutputRows, nOutputCols; int64_t nOutputPlane = nInputPlane; @@ -88,17 +88,17 @@ void THNN_(SpatialAveragePooling_updateOutput)( int64_t nOutputCols, nOutputRows; if (input->dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } if(ceil_mode) { @@ -174,18 +174,18 @@ void THNN_(SpatialAveragePooling_updateGradInput)( int dimRow = 1; if (input->dim() == 3) { - nInputPlane = input->size[0]; + nInputPlane = input->size(0); batchSize = 1; } else { dimCol = 3; dimRow = 2; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputPlane = input->size(1); + batchSize = input->size(0); } - nInputCols = input->size[dimCol]; - nInputRows = input->size[dimRow]; + nInputCols = input->size(dimCol); + nInputRows = input->size(dimRow); if(ceil_mode) { nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu index 64463945e6dadb..f22aba639a2d62 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -30,8 +30,8 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t nInputPlane = weight->size[2] / (kH * kW); - int64_t nOutputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(2) / (kH * kW); + int64_t nOutputPlane = weight->size(1); if (bias != NULL) { THCUNN_check_dim_size(state, bias, 3, 0, nOutputPlane); @@ -56,9 +56,9 @@ static THCTensor* THNN_(view_weight_local)( AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); if (weight->dim() == 6) { - int64_t s1 = weight->size[0] * weight->size[1]; - int64_t s2 = weight->size[2]; - 
int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + int64_t s1 = weight->size(0) * weight->size(1); + int64_t s2 = weight->size(2); + int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THCTensor *old_weight = weight; weight = THCTensor_(newWithStorage3d)(state, weight->storage, @@ -105,7 +105,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)( } // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -219,7 +219,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)( } // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -339,7 +339,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)( } // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Helpers THCTensor *input_n = THCTensor_(new)(state); diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index b5dab9b34394a5..e276d349d648cf 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -17,7 +17,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -37,8 +37,8 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t exactInputHeight = inputHeight + 2 * padH; int64_t exactInputWidth = inputWidth + 2 * padW; @@ -59,7 +59,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); if (weight->dim() == 2) { nInputPlane /= (kH * kW); } @@ -68,10 +68,10 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -103,12 +103,12 @@ void THNN_(SpatialConvolutionMM_updateOutput)( int freeWeight = 0; // Params: - int nInputPlane = weight->dim() == 2 ? weight->size[1]/(kH*kW) : weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->dim() == 2 ? 
weight->size(1)/(kH*kW) : weight->size(1); + int nOutputPlane = weight->size(0); if (weight->dim() == 4) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); freeWeight = 1; } @@ -121,16 +121,16 @@ void THNN_(SpatialConvolutionMM_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -141,7 +141,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -199,7 +199,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -257,13 +257,13 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); // Params - int nInputPlane = weight->dim() == 2 ? weight->size[1]/(kW*kH) : weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->dim() == 2 ? 
weight->size(1)/(kW*kH) : weight->size(1); + int nOutputPlane = weight->size(0); int freeWeight = 0; if (weight->dim() == 4) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); freeWeight = 1; } @@ -275,17 +275,17 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -306,7 +306,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -387,31 +387,31 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); int freeWeight = 0; if (gradWeight && gradWeight->dim() == 4) { - int64_t s1 = gradWeight->size[0]; - int64_t s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3]; + int64_t s1 = gradWeight->size(0); + int64_t s2 = gradWeight->size(1) * gradWeight->size(2) * gradWeight->size(3); gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); freeWeight = 1; } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < 
outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -448,7 +448,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu index fbdd8b4aa072a9..b0e65ed8b8fbab 100644 --- a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu @@ -19,16 +19,16 @@ void THNN_(LRNforward)(THCState* state, THCTensor* input, THCTensor* output, if (input->dim() == 3) { batchSize = 1; - nInputPlane = input->size[0]; - imsize_h = input->size[1]; - imsize_w = input->size[2]; + nInputPlane = input->size(0); + imsize_h = input->size(1); + imsize_w = input->size(2); } else { - batchSize = input->size[0]; - nInputPlane = input->size[1]; - imsize_h = input->size[2]; - imsize_w = input->size[3]; + batchSize = input->size(0); + nInputPlane = input->size(1); + imsize_h = input->size(2); + imsize_w = input->size(3); } input = THCTensor_(newContiguous)(state, input); @@ -64,16 +64,16 @@ void THNN_(LRNbackward)(THCState* state, THCTensor* input, THCTensor* output, if (input->dim() == 3) { batchSize = 1; - nInputPlane = input->size[0]; - imsize_h = input->size[1]; - imsize_w = input->size[2]; + nInputPlane = input->size(0); + imsize_h = input->size(1); + imsize_w = input->size(2); } else { - batchSize = input->size[0]; - nInputPlane = input->size[1]; - imsize_h = input->size[2]; - imsize_w = input->size[3]; + batchSize = input->size(0); + nInputPlane = input->size(1); + imsize_h = input->size(2); + imsize_w = input->size(3); } input = THCTensor_(newContiguous)(state, input); diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 61cd0e2f10b4d0..16c0f2475860d3 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -23,15 +23,15 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // the caller, so we verify that here to some extent // Weight Tensor is shape (output_channels, 1, kH, kW) - THAssert(weight->size[1] == 1); + THAssert(weight->size(1) == 1); // Input Tensor is shape (N, input_channels, H, W) // We verify that the # of output_channels is a multiple of input_channels - THAssert(weight->size[0] % input->size[1] == 0); + THAssert(weight->size(0) % input->size(1) == 0); // Bias has same # of channels as output if (bias) { - THAssert(bias->size[0] == weight->size[0]); + THAssert(bias->size(0) == weight->size(0)); } input = THCTensor_(newContiguous)(state, input); @@ -41,12 +41,12 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // Following the behvaior of other THCUNN functions, we shape the output // Tensor ourselves - int batchSize = input->size[0]; - int height = input->size[2]; - int width = input->size[3]; + int batchSize = input->size(0); + int height = input->size(2); + int width = input->size(3); int outputHeight = (height + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; int outputWidth = (width + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - int outputChannels = 
weight->size[0]; + int outputChannels = weight->size(0); THCTensor_(resize4d)(state, output, batchSize, outputChannels, outputHeight, outputWidth); @@ -61,7 +61,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( dBias = toDeviceTensor(state, bias); } - int inputChannels = input->size[1]; + int inputChannels = input->size(1); int depthwiseMultiplier = outputChannels / inputChannels; // One thread per output value @@ -113,20 +113,20 @@ void THNN_(SpatialDepthwiseConvolution_updateGradInput)( // Minimal shape checking, as above // Same # of elements in batch - THAssert(input->size[0] == gradOutput->size[0]); + THAssert(input->size(0) == gradOutput->size(0)); // Same # of filters as outputChannels - THAssert(weight->size[0] == gradOutput->size[1]); + THAssert(weight->size(0) == gradOutput->size(1)); // Resize GradInput THCTensor_(resizeAs)(state, gradInput, input); - int inputChannels = input->size[1]; - int height = input->size[2]; - int width = input->size[3]; + int inputChannels = input->size(1); + int height = input->size(2); + int width = input->size(3); - int outputChannels = gradOutput->size[1]; - int outputHeight = gradOutput->size[2]; - int outputWidth = gradOutput->size[3]; + int outputChannels = gradOutput->size(1); + int outputHeight = gradOutput->size(2); + int outputWidth = gradOutput->size(3); int depthwiseMultiplier = outputChannels / inputChannels; @@ -210,18 +210,18 @@ void THNN_(SpatialDepthwiseConvolution_accGradParameters)( // Minimal shape checking as above // Same # of elements in batch - THAssert(input->size[0] == gradOutput->size[0]); + THAssert(input->size(0) == gradOutput->size(0)); // Same # of filters as outputChannels - THAssert(gradWeight->size[0] == gradOutput->size[1]); + THAssert(gradWeight->size(0) == gradOutput->size(1)); - int batchSize = input->size[0]; - int inputChannels = input->size[1]; - int height = input->size[2]; - int width = input->size[3]; + int batchSize = input->size(0); + int inputChannels = input->size(1); + int height = input->size(2); + int width = input->size(3); - int outputChannels = gradOutput->size[1]; - int outputHeight = gradOutput->size[2]; - int outputWidth = gradOutput->size[3]; + int outputChannels = gradOutput->size(1); + int outputHeight = gradOutput->size(2); + int outputWidth = gradOutput->size(3); int depthwiseMultiplier = outputChannels / inputChannels; diff --git a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu index 1cac7f604d354f..6d218ab6ca4829 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -21,7 +21,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( "non-empty 4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -41,8 +41,8 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = 
(inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -54,16 +54,16 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -94,8 +94,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( dilationH, dilationW, 0); // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); weight = THCTensor_(newContiguous)(state, weight); @@ -105,16 +105,16 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -125,7 +125,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -184,7 +184,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -240,8 +240,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( dilationH, dilationW, 0); // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -251,17 +251,17 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -282,7 +282,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -367,22 +367,22 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || 
ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -420,7 +420,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu index 7425345ce2c1fe..48a13720c48f31 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu @@ -25,7 +25,7 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( int batchSize = 1; if (ndim == 4) { - batchSize = input->size[0]; + batchSize = input->size(0); dimf++; dimh++; dimw++; @@ -38,9 +38,9 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t nInputRows = input->size[dimh]; - int64_t nInputCols = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t nInputRows = input->size(dimh); + int64_t nInputCols = input->size(dimw); int64_t nOutputRows, nOutputCols; int64_t nOutputPlane = nInputPlane; @@ -102,17 +102,17 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)( int64_t nOutputCols, nOutputRows; if (input->dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } if(ceil_mode) { @@ -181,17 +181,17 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)( int64_t nOutputCols, nOutputRows; if (input->_dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } if(ceil_mode) { diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu index 58ab364551c76c..76777796e361e4 100644 --- a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -24,7 +24,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(1)); } } else if 
(!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -44,8 +44,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; @@ -56,16 +56,16 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(0); THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size(1); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -105,16 +105,16 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -125,7 +125,7 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
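SpatialFullDilatedConvolution, being the transposed variant, uses the inverse relation shown in the hunks above: out = (in - 1)*stride - 2*pad + (dilation*(k - 1) + 1) + adj. A quick standalone check with illustrative numbers (the helper name is invented for this sketch):

    #include <cstdint>

    // Same formula as in SpatialFullDilatedConvolution_shapeCheck:
    //   out = (in - 1)*stride - 2*pad + (dilation*(k - 1) + 1) + adj
    static int64_t full_dilated_out_size(int64_t in, int64_t stride, int64_t pad,
                                         int64_t dilation, int64_t k, int64_t adj) {
      return (in - 1) * stride - 2 * pad + (dilation * (k - 1) + 1) + adj;
    }

    // Example: in = 30, stride = 1, pad = 1, dilation = 2, k = 3, adj = 0
    //   -> 29 - 2 + 5 + 0 = 32, recovering the 32-pixel input from the
    //   forward-direction example given earlier.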
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -143,9 +143,9 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; - int64_t n = columns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1) * weight->size(2) * weight->size(3); + int64_t n = columns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -244,17 +244,17 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -285,9 +285,9 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t m = weight->size(0); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(1) * weight->size(2) * weight->size(3); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -371,20 +371,20 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -419,9 +419,9 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t n = columns->size[0]; // nOutputPlane * kh * kw - int64_t m = input_n->size[0]; // nInputPlane - int64_t k = columns->size[1]; // inputHeight * inputWidth + int64_t n = columns->size(0); // nOutputPlane * kh * kw + int64_t m = input_n->size(0); // nInputPlane + int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -488,7 +488,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // Resize if (is_batch == 0) { THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCTensor_(resize3d)(state, input, input->size[1], inputHeight, inputWidth); + THCTensor_(resize3d)(state, input, input->size(1), inputHeight, inputWidth); } THCTensor_(free)(state, input); diff --git a/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu index 90d6e0a837f067..e86896e2764434 100644 --- a/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu +++ b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu @@ -17,17 +17,17 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( int64_t nInputCols, nInputRows, nInputPlane, batchSize; if (input->dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } input = THCTensor_(newContiguous)(state, input); @@ -65,22 +65,22 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)( int dimh = 1; if (input->dim() == 3) { - nInputPlane = input->size[0]; + nInputPlane = input->size(0); batchSize = 1; } else { ++dimw; ++dimh; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputPlane = input->size(1); + batchSize = input->size(0); } - nInputCols = input->size[dimw]; - nInputRows = input->size[dimh]; + nInputCols = input->size(dimw); + nInputRows = input->size(dimh); - if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){ THError("Inconsistent gradOutput size. 
oheight= %d, owidth= %d, gradOutput: %dx%d", - oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + oheight, owidth,gradOutput->size(dimh),gradOutput->size(dimw)); } input = THCTensor_(newContiguous)(state, input); diff --git a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu index 0c90944e84a5f3..4e3ab6c7de24c5 100644 --- a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu @@ -97,8 +97,8 @@ void THNN_(SpatialReflectionPadding_updateGradInput)( dimh++; dimw++; } - int iheight = input->size[dimh]; - int iwidth = input->size[dimw]; + int iheight = input->size(dimh); + int iwidth = input->size(dimw); int oheight = iheight + padT + padB; int owidth = iwidth + padL + padR; diff --git a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu index 6ab694dcacd92e..07c51292bf2b37 100644 --- a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu @@ -87,8 +87,8 @@ void THNN_(SpatialReplicationPadding_updateGradInput)( dimh++; dimw++; } - int iheight = input->size[dimh]; - int iwidth = input->size[dimw]; + int iheight = input->size(dimh); + int iwidth = input->size(dimw); int oheight = iheight + padT + padB; int owidth = iwidth + padL + padR; diff --git a/aten/src/THCUNN/generic/SpatialSubSampling.cu b/aten/src/THCUNN/generic/SpatialSubSampling.cu index ea71c82e943fdb..d44168218e44c9 100644 --- a/aten/src/THCUNN/generic/SpatialSubSampling.cu +++ b/aten/src/THCUNN/generic/SpatialSubSampling.cu @@ -25,9 +25,9 @@ static inline void THNN_(SpatialSubSampling_shapeCheck)( dimp++; } - int64_t nInputCols = input->size[dimc]; - int64_t nInputRows = input->size[dimr]; - THArgCheck(input->size[dimp] == nInputPlane, 2, "invalid number of input planes"); + int64_t nInputCols = input->size(dimc); + int64_t nInputRows = input->size(dimr); + THArgCheck(input->size(dimp) == nInputPlane, 2, "invalid number of input planes"); THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); } @@ -51,8 +51,8 @@ void THNN_(SpatialSubSampling_updateOutput)( THNN_(SpatialSubSampling_shapeCheck)(state, input, NULL, weight, kW, kH); if (input->dim() == 3) { - int64_t nInputCols = input->size[2]; - int64_t nInputRows = input->size[1]; + int64_t nInputCols = input->size(2); + int64_t nInputRows = input->size(1); int64_t nOutputCols = (nInputCols - kW) / dW + 1; int64_t nOutputRows = (nInputRows - kH) / dH + 1; @@ -74,9 +74,9 @@ void THNN_(SpatialSubSampling_updateOutput)( nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); THCudaCheck(cudaGetLastError()); } else { - int64_t nInputCols = input->size[3]; - int64_t nInputRows = input->size[2]; - int64_t nbatch = input->size[0]; + int64_t nInputCols = input->size(3); + int64_t nInputRows = input->size(2); + int64_t nbatch = input->size(0); int64_t nOutputCols = (nInputCols - kW) / dW + 1; int64_t nOutputRows = (nInputRows - kH) / dH + 1; @@ -119,8 +119,8 @@ void THNN_(SpatialSubSampling_updateGradInput)( int nInputPlane = THCTensor_(size)(state, weight, 0); if (input->dim() == 3) { - int64_t nInputCols = input->size[2]; - int64_t nInputRows = input->size[1]; + int64_t nInputCols = input->size(2); + int64_t nInputRows = input->size(1); real *weight_data = THCTensor_(data)(state, weight); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -149,9 +149,9 @@ void THNN_(SpatialSubSampling_updateGradInput)( } 
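Nearly every hunk in this file and its neighbours is the same mechanical change: raw field reads such as input->size[0] and input->stride[0] become accessor calls input->size(0) and input->stride(0), and the THCUNN.h hunk further down widens several kernel parameters from int to int64_t. The diff never shows the accessor definitions themselves; the struct below is only a guess at their shape, meant to illustrate why a checked accessor is preferable to indexing the raw array.

    #include <cassert>
    #include <cstdint>

    // Hypothetical illustration only -- the real THTensor/THCTensor accessors
    // are defined elsewhere in the tree and may differ in detail.
    struct ExampleTensor {
      int64_t *sizes;
      int64_t *strides;
      int ndim;

      int64_t size(int d) const {
        assert(d >= 0 && d < ndim);  // a check that raw size[d] never performed
        return sizes[d];
      }
      int64_t stride(int d) const {
        assert(d >= 0 && d < ndim);
        return strides[d];
      }
    };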
THCudaCheck(cudaGetLastError()); } else { - int64_t nInputCols = input->size[3]; - int64_t nInputRows = input->size[2]; - int64_t nbatch = input->size[0]; + int64_t nInputCols = input->size(3); + int64_t nInputRows = input->size(2); + int64_t nbatch = input->size(0); real *weight_data = THCTensor_(data)(state, weight); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -199,8 +199,8 @@ void THNN_(SpatialSubSampling_accGradParameters)( int nInputPlane = THCTensor_(size)(state, gradWeight, 0); if (input->dim() == 3) { - int64_t nInputCols = input->size[2]; - int64_t nInputRows = input->size[1]; + int64_t nInputCols = input->size(2); + int64_t nInputRows = input->size(1); real *gradWeight_data = THCTensor_(data)(state, gradWeight); real *gradBias_data = THCTensor_(data)(state, gradBias); @@ -221,9 +221,9 @@ void THNN_(SpatialSubSampling_accGradParameters)( nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); THCudaCheck(cudaGetLastError()); } else { - int64_t nInputCols = input->size[3]; - int64_t nInputRows = input->size[2]; - int64_t nbatch = input->size[0]; + int64_t nInputCols = input->size(3); + int64_t nInputRows = input->size(2); + int64_t nbatch = input->size(0); real *gradWeight_data = THCTensor_(data)(state, gradWeight); real *gradBias_data = THCTensor_(data)(state, gradBias); @@ -242,8 +242,8 @@ void THNN_(SpatialSubSampling_accGradParameters)( int64_t sl; for (sl=0; sl <<>> ( - input_data + sl*input->stride[0], - gradOutput_data + sl*gradOutput->stride[0], + input_data + sl*input->stride(0), + gradOutput_data + sl*gradOutput->stride(0), gradWeight_data, gradBias_data, nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); } diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index c48536e4ded57b..eaadf66c8306ee 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -183,39 +183,39 @@ THC_API void THNN_(Im2Col_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(Im2Col_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(Col2Im_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(Col2Im_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(LeakyReLU_updateOutput)( THCState *state, diff --git a/aten/src/THCUNN/generic/TemporalConvolution.cu b/aten/src/THCUNN/generic/TemporalConvolution.cu index 1bb17612fd9e89..25baf933b57a98 100644 --- a/aten/src/THCUNN/generic/TemporalConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalConvolution.cu @@ -25,13 +25,13 
@@ static inline void THNN_(TemporalConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); if (inputFrameSize != NULL) { - THArgCheck(input->size[dimF] == *inputFrameSize, 2, + THArgCheck(input->size(dimF) == *inputFrameSize, 2, "invalid input frame size. Got: %d, Expected: %d", - input->size[dimF], *inputFrameSize); + input->size(dimF), *inputFrameSize); } - THArgCheck(input->size[dimS] >= kW, 2, + THArgCheck(input->size(dimS) >= kW, 2, "input sequence smaller than kernel size. Got: %d, Expected: %d", - input->size[dimS], kW); + input->size(dimS), kW); } void THNN_(TemporalConvolution_updateOutput)( @@ -65,7 +65,7 @@ void THNN_(TemporalConvolution_updateOutput)( outputWindow = THCTensor_(new)(state); inputWindow = THCTensor_(new)(state); - nInputFrame = input->size[dimS]; + nInputFrame = input->size(dimS); nOutputFrame = (nInputFrame - kW) / dW + 1; if (input->dim() == 2) @@ -91,14 +91,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THCTensor_(setStorage2d)(state, outputWindow, output->storage, - output->storageOffset + k*output->size[1], - nFrame, outputFrameStride*output->size[1], - output->size[1], 1); + output->storageOffset + k*output->size(1), + nFrame, outputFrameStride*output->size(1), + output->size(1), 1); THCTensor *tweight = THCTensor_(new)(state); THCTensor_(transpose)(state, tweight, weight, 0, 1); @@ -110,7 +110,7 @@ void THNN_(TemporalConvolution_updateOutput)( { THCTensor *outputSample = THCTensor_(new)(state); THCTensor *inputSample = THCTensor_(new)(state); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); THCTensor_(resize3d)(state, output, nBatchFrame, @@ -139,14 +139,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputSampleFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size[1], - nFrame, outputFrameStride*outputSample->size[1], - outputSample->size[1], 1); + outputSample->storageOffset + k*outputSample->size(1), + nFrame, outputFrameStride*outputSample->size(1), + outputSample->size(1), 1); THCTensor *tweight = THCTensor_(new)(state); THCTensor_(transpose)(state, tweight, weight, 0, 1); @@ -194,8 +194,8 @@ void THNN_(TemporalConvolution_updateGradInput)( dimS = 1; } - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); /* Not necessary with partial backprop: */ @@ -216,14 +216,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputFrame -= nFrame; THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset 
+ k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size[1], - nFrame, inputFrameStride*gradInput->size[1], - kW*gradInput->size[1], 1); + gradInput->storageOffset+k*dW*gradInput->size(1), + nFrame, inputFrameStride*gradInput->size(1), + kW*gradInput->size(1), 1); THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); } @@ -232,7 +232,7 @@ void THNN_(TemporalConvolution_updateGradInput)( { THCTensor *gradOutputSample = THCTensor_(new)(state); THCTensor *gradInputSample = THCTensor_(new)(state); - int64_t nBatchFrame = input->size[0]; + int64_t nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); @@ -248,14 +248,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputSampleFrame -= nFrame; THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size[1], - nFrame, inputFrameStride*gradInputSample->size[1], - kW*gradInputSample->size[1], 1); + gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + nFrame, inputFrameStride*gradInputSample->size(1), + kW*gradInputSample->size(1), 1); THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); } @@ -298,8 +298,8 @@ void THNN_(TemporalConvolution_accGradParameters)( dimS = 1; } - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); /* Not necessary with partial backprop: */ input = THCTensor_(newContiguous)(state, input); @@ -325,14 +325,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset + k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THCTensor *tgradOutputWindow = THCTensor_(new)(state); THCTensor_(transpose)(state, tgradOutputWindow, gradOutputWindow, 0, 1); @@ -344,7 +344,7 @@ void THNN_(TemporalConvolution_accGradParameters)( { THCTensor *gradOutputSample = THCTensor_(new)(state); THCTensor *inputSample = THCTensor_(new)(state); - int64_t nBatchFrame = input->size[0]; + int64_t nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { @@ -368,14 +368,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputSampleFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - 
inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THCTensor *tgradOutputWindow = THCTensor_(new)(state); THCTensor_(transpose)(state, tgradOutputWindow, gradOutputWindow, 0, 1); diff --git a/aten/src/THCUNN/generic/TemporalMaxPooling.cu b/aten/src/THCUNN/generic/TemporalMaxPooling.cu index e355ebd14ee5c6..a950aa730afb59 100644 --- a/aten/src/THCUNN/generic/TemporalMaxPooling.cu +++ b/aten/src/THCUNN/generic/TemporalMaxPooling.cu @@ -27,12 +27,12 @@ static inline void THNN_(TemporalMaxPooling_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); - THArgCheck(input->size[dimT] >= kW, 2, + THArgCheck(input->size(dimT) >= kW, 2, "input sequence smaller than kernel size. Got: %d, Expected: %d", - input->size[dimT], kW); + input->size(dimT), kW); - input_w = input->size[dimT]; - input_n = input->size[dimF]; + input_w = input->size(dimT); + input_n = input->size(dimF); output_w = (input_w - kW) / dW + 1; if (gradOutput != NULL) { @@ -71,23 +71,23 @@ void THNN_(TemporalMaxPooling_updateOutput)( { dimT = 1; dimF = 2; - batch = input->size[0]; + batch = input->size(0); } input = THCTensor_(newContiguous)(state, input); - input_w = input->size[dimT]; - input_n = input->size[dimF]; + input_w = input->size(dimT); + input_n = input->size(dimF); output_w = (input_w - kW) / dW + 1; if (input->dim() == 2) { - THCTensor_(resize2d)(state, output, output_w, input->size[dimF]); - THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]); + THCTensor_(resize2d)(state, output, output_w, input->size(dimF)); + THCIndexTensor_(resize2d)(state, indices, output_w, input->size(dimF)); } else { - THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]); - THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]); + THCTensor_(resize3d)(state, output, batch, output_w, input->size(dimF)); + THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size(dimF)); } input_data = THCTensor_(data)(state, input); @@ -146,12 +146,12 @@ void THNN_(TemporalMaxPooling_updateGradInput)( { dimT = 1; dimF = 2; - batch = input->size[0]; + batch = input->size(0); } gradOutput = THCTensor_(newContiguous)(state, gradOutput); - input_w = input->size[dimT]; - input_n = input->size[dimF]; + input_w = input->size(dimT); + input_n = input->size(dimF); output_w = (input_w - kW) / dW + 1; gradInput_data = THCTensor_(data)(state, gradInput); diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 394c796cb9a5ec..0beea05cd2826e 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -84,7 +84,7 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( planeDim++; dimw++; } - int iwidth = input->size[dimw]; + int iwidth = 
input->size(dimw); int owidth = iwidth + padL + padR; THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, diff --git a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu index 11637dc9dfa31b..96f0ad6fa46304 100644 --- a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu @@ -79,7 +79,7 @@ void THNN_(TemporalReplicationPadding_updateGradInput)( planeDim++; dimw++; } - int iwidth = input->size[dimw]; + int iwidth = input->size(dimw); int owidth = iwidth + padL + padR; THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, diff --git a/aten/src/THCUNN/generic/TemporalRowConvolution.cu b/aten/src/THCUNN/generic/TemporalRowConvolution.cu index 26361d498eeb55..83a32ca2a063dd 100644 --- a/aten/src/THCUNN/generic/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalRowConvolution.cu @@ -14,7 +14,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( weight, "non-empty 2D or 3D weight tensor expected, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } int ndim = input->dim(); @@ -29,8 +29,8 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[dimS]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; if (nOutputFrame < 1) { @@ -84,16 +84,16 @@ void THNN_(TemporalRowConvolution_updateOutput)( if (ndim == 2) { // Force batch batch = 0; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); } // Params: - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[2]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(2); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; // Batch size - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize3d)(state, output, batchSize, inputFrameSize, nOutputFrame); @@ -104,7 +104,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever // gets increased and always contains ones. - if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + if (ones->dim() != 2 || ones->size(0) * ones->size(1) < nOutputFrame) { // Resize plane and fill with ones... 
THCTensor_(resize2d)(state, ones, 1, nOutputFrame); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -218,18 +218,18 @@ void THNN_(TemporalRowConvolution_updateGradInput)( if (ndim == 2) { // Force batch batch = 0; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); - THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], - gradOutput->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size(0), + gradOutput->size(1)); } // Params: - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[2]; - int64_t nOutputFrame = gradOutput->size[2]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(2); + int64_t nOutputFrame = gradOutput->size(2); // Batch size - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize3d)(state, gradInput, batchSize, inputFrameSize, @@ -331,21 +331,21 @@ void THNN_(TemporalRowConvolution_accGradParameters)( if (ndim == 2) { // Force batch batch = 0; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); - THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], - gradOutput->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size(0), + gradOutput->size(1)); } // Params: - int64_t inputFrameSize = gradWeight->size[0]; - int64_t nInputFrame = input->size[2]; - int64_t nOutputFrame = gradOutput->size[2]; + int64_t inputFrameSize = gradWeight->size(0); + int64_t nInputFrame = input->size(2); + int64_t nOutputFrame = gradOutput->size(2); // Batch size - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + if (ones->dim() != 2 || ones->size(0) * ones->size(1) < nOutputFrame) { // Resize plane and fill with ones... 
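The "buffer of ones" that several of these functions maintain exists so the bias can be added with the same GEMM machinery used for the convolution: a rank-1 product of the bias vector with a row of ones broadcasts the bias over every output location. A CPU-side sketch of that idea, with plain loops standing in for the cuBLAS call and made-up names:

    #include <cstdint>
    #include <vector>

    // output is nOutputPlane x nOutputLocations (flattened spatial/temporal dims).
    // bias * ones^T is a rank-1 update, which is what the GPU path expresses as
    // a GEMM against the shared ones buffer.
    void add_bias_with_ones(std::vector<float>& output,
                            const std::vector<float>& bias,
                            int64_t nOutputPlane, int64_t nOutputLocations) {
      std::vector<float> ones(nOutputLocations, 1.0f);
      for (int64_t p = 0; p < nOutputPlane; ++p)
        for (int64_t i = 0; i < nOutputLocations; ++i)
          output[p * nOutputLocations + i] += bias[p] * ones[i];  // == bias[p]
    }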
THCTensor_(resize2d)(state, ones, 1, nOutputFrame); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu index d29748338f7298..3b34077d1cee84 100644 --- a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu @@ -28,15 +28,15 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); - istrideD = input->stride[0]; - istrideT = input->stride[1]; - istrideH = input->stride[2]; - istrideW = input->stride[3]; + istrideD = input->stride(0); + istrideT = input->stride(1); + istrideH = input->stride(2); + istrideW = input->stride(3); THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); @@ -44,16 +44,16 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; + int64_t sizeB = input->size(0); + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); - istrideD = input->stride[1]; - istrideT = input->stride[2]; - istrideH = input->stride[3]; - istrideW = input->stride[4]; + istrideD = input->stride(1); + istrideT = input->stride(2); + istrideH = input->stride(3); + istrideW = input->stride(4); THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); @@ -107,23 +107,23 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; - - osizeT = gradOutput->size[1]; - osizeH = gradOutput->size[2]; - osizeW = gradOutput->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); + + osizeT = gradOutput->size(1); + osizeH = gradOutput->size(2); + osizeW = gradOutput->size(3); } else { - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; - - osizeT = gradOutput->size[2]; - osizeH = gradOutput->size[3]; - osizeW = gradOutput->size[4]; + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); + + osizeT = gradOutput->size(2); + osizeH = gradOutput->size(3); + osizeW = gradOutput->size(4); } // somehow nonatomic is passing all test for volumetric case. @@ -132,7 +132,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( if (input->dim() == 4) { totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; } else { - int sizeB = input->size[0]; + int sizeB = input->size(0); totalZ = atomic ? 
sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; } diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu index 7f876ae26b5721..adc23e15dabedc 100644 --- a/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu @@ -29,15 +29,15 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); - istrideD = input->stride[0]; - istrideT = input->stride[1]; - istrideH = input->stride[2]; - istrideW = input->stride[3]; + istrideD = input->stride(0); + istrideT = input->stride(1); + istrideH = input->stride(2); + istrideW = input->stride(3); THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); THCIndexTensor_(resize4d)(state, indices, sizeD, osizeT, osizeH, osizeW); @@ -46,16 +46,16 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; + int64_t sizeB = input->size(0); + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); - istrideD = input->stride[1]; - istrideT = input->stride[2]; - istrideH = input->stride[3]; - istrideW = input->stride[4]; + istrideD = input->stride(1); + istrideT = input->stride(2); + istrideH = input->stride(3); + istrideW = input->stride(4); THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); THCIndexTensor_(resize5d)(state, indices, sizeB, sizeD, osizeT, osizeH, osizeW); @@ -113,23 +113,23 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; - - osizeT = gradOutput->size[1]; - osizeH = gradOutput->size[2]; - osizeW = gradOutput->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); + + osizeT = gradOutput->size(1); + osizeH = gradOutput->size(2); + osizeW = gradOutput->size(3); } else { - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; - - osizeT = gradOutput->size[2]; - osizeH = gradOutput->size[3]; - osizeW = gradOutput->size[4]; + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); + + osizeT = gradOutput->size(2); + osizeH = gradOutput->size(3); + osizeW = gradOutput->size(4); } bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); @@ -137,7 +137,7 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( if (input->dim() == 4) { totalZ = sizeD * osizeT; } else { - int sizeB = input->size[0]; + int sizeB = input->size(0); totalZ = sizeB * sizeD * osizeT; } diff --git a/aten/src/THCUNN/generic/VolumetricAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu index b32643ddc4d091..54987bc26bc366 100644 --- a/aten/src/THCUNN/generic/VolumetricAveragePooling.cu +++ b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu @@ -32,11 +32,11 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( if (!input->is_empty() && 
THCTensor_(nDimension)(state, input) == 4) { - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH - && input->size[dimt] >= kT, 2, + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH + && input->size(dimt) >= kT, 2, "input image (T: %d H: %d W: %d) smaller than " "kernel size (kT: %d kH: %d kW: %d)", - input->size[dimt], input->size[dimh], input->size[dimw], + input->size(dimt), input->size(dimh), input->size(dimw), kT, kH, kW); /* sizes */ @@ -47,11 +47,11 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( } else if (!input->is_empty() && THCTensor_(nDimension)(state, input) == 5) { - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH - && input->size[dimt] >= kT, 2, + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH + && input->size(dimt) >= kT, 2, "input image (T: %d H: %d W: %d) smaller than " "kernel size (kT: %d kH: %d kW: %d)", - input->size[dimt], input->size[dimh], input->size[dimw], + input->size(dimt), input->size(dimh), input->size(dimw), kT, kH, kW); /* sizes */ diff --git a/aten/src/THCUNN/generic/VolumetricConvolution.cu b/aten/src/THCUNN/generic/VolumetricConvolution.cu index e76f8cb42531f9..f21402e65efc23 100644 --- a/aten/src/THCUNN/generic/VolumetricConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricConvolution.cu @@ -47,11 +47,11 @@ static inline void THNN_(VolumetricConvolution_shapeCheck) if (weight == NULL) { weight = gradWeight; } - int64_t nOutputPlane = weight->size[0]; - int64_t nInputPlane = weight->size[1]; - int64_t kT = weight->size[2]; - int64_t kH = weight->size[3]; - int64_t kW = weight->size[4]; + int64_t nOutputPlane = weight->size(0); + int64_t nInputPlane = weight->size(1); + int64_t kT = weight->size(2); + int64_t kH = weight->size(3); + int64_t kW = weight->size(4); THArgCheck(kT > 0 && kW > 0 && kH > 0, 4, "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); @@ -69,9 +69,9 @@ static inline void THNN_(VolumetricConvolution_shapeCheck) dimd++; } - int64_t inputWidth = input->size[dimw]; - int64_t inputHeight = input->size[dimh]; - int64_t inputDepth = input->size[dimd]; + int64_t inputWidth = input->size(dimw); + int64_t inputHeight = input->size(dimh); + int64_t inputDepth = input->size(dimd); int64_t exactInputDepth = inputDepth + 2*padT; int64_t exactInputHeight = inputHeight + 2*padH; @@ -97,7 +97,7 @@ static inline void THNN_(VolumetricConvolution_shapeCheck) } if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); @@ -128,30 +128,30 @@ void THNN_(VolumetricConvolution_updateOutput)( bias, dT, dW, dH, padT, padW, padH); input = THCTensor_(newContiguous)(state, input); - int nOutputPlane = (int)weight->size[0]; - int nInputPlane = (int)weight->size[1]; - int kT = (int)weight->size[2]; - int kH = (int)weight->size[3]; - int kW = (int)weight->size[4]; + int nOutputPlane = (int)weight->size(0); + int nInputPlane = (int)weight->size(1); + int kT = (int)weight->size(2); + int kH = (int)weight->size(3); + int kW = (int)weight->size(4); int batch = 1; if (input->dim() == 4) { // Force batch batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], - input->size[2], input->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), + input->size(2), input->size(3)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; - int64_t 
inputDepth = input->size[4]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); + int64_t inputDepth = input->size(4); int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, @@ -163,7 +163,7 @@ void THNN_(VolumetricConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); @@ -220,9 +220,9 @@ void THNN_(VolumetricConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = columns->size[1]; - int64_t k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + int64_t m = weight->size(0); + int64_t n = columns->size(1); + int64_t k = weight->size(1)*weight->size(2)*weight->size(3)*weight->size(4); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -267,11 +267,11 @@ void THNN_(VolumetricConvolution_updateGradInput)( int padT, int padW, int padH) { - int64_t nOutputPlane = weight->size[0]; - int64_t nInputPlane = weight->size[1]; - int64_t kT = weight->size[2]; - int64_t kH = weight->size[3]; - int64_t kW = weight->size[4]; + int64_t nOutputPlane = weight->size(0); + int64_t nInputPlane = weight->size(1); + int64_t kT = weight->size(2); + int64_t kH = weight->size(3); + int64_t kW = weight->size(4); THCTensor *gradColumns = finput; @@ -287,19 +287,19 @@ void THNN_(VolumetricConvolution_updateGradInput)( input = THCTensor_(newContiguous)(state, input); // Force batch batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; - int64_t inputDepth = input->size[4]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); + int64_t inputDepth = input->size(4); int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth); @@ -320,9 +320,9 @@ void THNN_(VolumetricConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see 
http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1)*weight->size(2)*weight->size(3)*weight->size(4); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -387,11 +387,11 @@ void THNN_(VolumetricConvolution_accGradParameters)( state, input, gradOutput, NULL, gradWeight, gradBias, dT, dW, dH, padT, padW, padH); - int nOutputPlane = (int)gradWeight->size[0]; - int nInputPlane = (int)gradWeight->size[1]; - int kT = (int)gradWeight->size[2]; - int kH = (int)gradWeight->size[3]; - int kW = (int)gradWeight->size[4]; + int nOutputPlane = (int)gradWeight->size(0); + int nInputPlane = (int)gradWeight->size(1); + int kT = (int)gradWeight->size(2); + int kH = (int)gradWeight->size(3); + int kW = (int)gradWeight->size(4); input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -401,22 +401,22 @@ void THNN_(VolumetricConvolution_accGradParameters)( { // Force batch batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; - int64_t inputDepth = input->size[4]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); + int64_t inputDepth = input->size(4); int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
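The m, n, k values read out of weight and gradColumns a few hunks above encode the backward-by-data GEMM, gradColumns = weight^T * gradOutput_n, whose result is then scattered back into gradInput. A shape-only sketch of that bookkeeping (the struct and function names are invented for the illustration):

    #include <cstdint>

    struct GemmDims { int64_t m, n, k; };

    // gradColumns (m x n) = weight^T (m x k) * gradOutput_n (k x n), with
    //   m = nInputPlane * kT * kH * kW   (rows of the column buffer)
    //   n = outT * outH * outW           (output locations per sample)
    //   k = nOutputPlane                 (the reduced dimension)
    GemmDims grad_input_gemm_dims(int64_t nInputPlane, int64_t nOutputPlane,
                                  int64_t kT, int64_t kH, int64_t kW,
                                  int64_t outT, int64_t outH, int64_t outW) {
      return { nInputPlane * kT * kH * kW,
               outT * outH * outW,
               nOutputPlane };
    }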
THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); @@ -447,9 +447,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = gradWeight->size[0]; - int64_t n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4]; - int64_t k = columns->size[1]; + int64_t m = gradWeight->size(0); + int64_t n = gradWeight->size(1)*gradWeight->size(2)*gradWeight->size(3)*gradWeight->size(4); + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu index 5751ab44662ac2..9e28ab80e43c68 100644 --- a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -31,7 +31,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -50,9 +50,9 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( dimw++; } - int64_t inputDepth = input->size[dimd]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputDepth = input->size(dimd); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -64,16 +64,16 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); @@ -105,8 +105,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( dilationT, dilationH, dilationW, 0); // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); weight = THCTensor_(newContiguous)(state, weight); @@ -116,18 +116,18 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputHeight = input->size[3]; - int64_t inputWidth = input->size[4]; + int64_t 
inputDepth = input->size(2); + int64_t inputHeight = input->size(3); + int64_t inputWidth = input->size(4); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -138,7 +138,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -197,7 +197,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kT*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -256,8 +256,8 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( weight = THCTensor_(newContiguous)(state, weight); // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -265,19 +265,19 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -298,7 +298,7 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nInputPlane*kT*kW*kH; - int64_t n = gradColumns->size[1]; 
+ int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -375,24 +375,24 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
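The recurring comment that "gemm assumes column-major matrices" refers to the usual trick of handing row-major buffers to a column-major GEMM with the operands swapped: C = A*B is equivalent to C^T = B^T * A^T, and a row-major matrix reinterpreted as column-major is its transpose. A self-contained sketch of that trick, with a hand-rolled GEMM standing in for the actual BLAS call:

#include <cstdint>
#include <cstdio>

// Minimal column-major GEMM, C = alpha*A*B + beta*C, with A (m x k), B (k x n),
// C (m x n), all column-major with leading dimensions lda/ldb/ldc. This is an
// illustration only, not the real THCBlas/cuBLAS entry point.
static void gemm_colmajor(int64_t m, int64_t n, int64_t k, float alpha,
                          const float* A, int64_t lda, const float* B, int64_t ldb,
                          float beta, float* C, int64_t ldc) {
  for (int64_t j = 0; j < n; ++j)
    for (int64_t i = 0; i < m; ++i) {
      float acc = 0;
      for (int64_t p = 0; p < k; ++p)
        acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
}

int main() {
  // Row-major A (2x3) and B (3x2); we want row-major C = A*B (2x2).
  float A[6] = {1, 2, 3,
                4, 5, 6};
  float B[6] = {7, 8,
                9, 10,
                11, 12};
  float C[4] = {0, 0, 0, 0};
  // The trick the surrounding call sites appear to rely on: row-major memory
  // read column-major is the transpose, and C^T = B^T * A^T, so call the
  // column-major GEMM with the operands swapped.
  gemm_colmajor(/*m=*/2, /*n=*/2, /*k=*/3, 1.0f, B, /*lda=*/2, A, /*ldb=*/3, 0.0f, C, /*ldc=*/2);
  // C in row-major order should come out as {58, 64, 139, 154}.
  printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}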
THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -429,7 +429,7 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; int64_t n = nInputPlane*kT*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu index bd653b9f7d195b..96310609e956f4 100644 --- a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -34,7 +34,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(1)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -58,9 +58,9 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } - int64_t inputWidth = input->size[dimw]; - int64_t inputHeight = input->size[dimh]; - int64_t inputDepth = input->size[dimd]; + int64_t inputWidth = input->size(dimw); + int64_t inputHeight = input->size(dimh); + int64_t inputDepth = input->size(dimd); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; @@ -122,18 +122,18 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; - int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); + int64_t inputDepth = input->size(2); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -144,7 +144,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
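The output-extent arithmetic in these shape checks is the standard dilated-convolution formula and its transposed ("full") counterpart. A small numeric check of the two, independent of the TH code (the example sizes below are made up):

#include <cstdint>
#include <cstdio>

// Dilated convolution output extent, as used in the shape checks above.
int64_t conv_out(int64_t in, int64_t pad, int64_t dilation, int64_t k, int64_t stride) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

// "Full" (transposed) convolution output extent, with output padding adj.
int64_t full_conv_out(int64_t in, int64_t pad, int64_t dilation, int64_t k,
                      int64_t stride, int64_t adj) {
  return (in - 1) * stride - 2 * pad + (dilation * (k - 1) + 1) + adj;
}

int main() {
  // in=32, pad=1, dilation=2, k=3, stride=1 -> 30
  printf("%lld\n", (long long)conv_out(32, 1, 2, 3, 1));
  // Feeding that back through the full-convolution formula recovers 32,
  // which is why the two appear as a forward/transposed pair in these files.
  printf("%lld\n", (long long)full_conv_out(30, 1, 2, 3, 1, 0));
  return 0;
}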
THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -162,9 +162,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - int64_t n = columns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); + int64_t n = columns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -272,19 +272,19 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; - int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); + int64_t inputDepth = input->size(2); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -316,9 +316,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + int64_t m = weight->size(0); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -407,22 +407,22 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; - int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); + int64_t inputDepth = input->size(2); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT 
+ (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -458,9 +458,9 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw - int64_t m = input_n->size[0]; // nInputPlane - int64_t k = columns->size[1]; // inputHeight * inputWidth + int64_t n = columns->size(0); // nOutputPlane * kt * kh * kw + int64_t m = input_n->size(0); // nInputPlane + int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -527,7 +527,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( // Resize if (is_batch == 0) { THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THCTensor_(resize4d)(state, input, input->size[1], inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, input, input->size(1), inputDepth, inputHeight, inputWidth); } THCTensor_(free)(state, input); diff --git a/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu index 0b5a17d0eec02f..8d482f04f873ed 100644 --- a/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu +++ b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu @@ -51,11 +51,11 @@ static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( } if (gradOutput != NULL) { - if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + if (oT != gradOutput->size(dimt) || oW != gradOutput->size(dimw) || oH != gradOutput->size(dimh)) { THError( "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", - oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]); + oT, oH, oW, gradOutput->size(dimt), gradOutput->size(dimh), gradOutput->size(dimw)); } THCUNN_check_dim_size(state, gradOutput, input->dim(), dimn, inputSlices); diff --git a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu index 071b322232126b..fab12533901c0d 100644 --- a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu @@ -28,9 +28,9 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( } int numPlanes = THCTensor_(size)(state, input, planeDim); - int idepth = input->size[dimd]; - int iheight = input->size[dimh]; - int iwidth = input->size[dimw]; + int idepth = input->size(dimd); + int iheight = input->size(dimh); + int iwidth = input->size(dimw); int odepth = idepth + pfront + pback; int oheight = iheight + ptop + pbottom; int owidth = iwidth + pleft + pright; diff --git a/aten/src/THCUNN/im2col.h b/aten/src/THCUNN/im2col.h index 22f47e0eb9b25e..ba905609d3135f 100644 --- a/aten/src/THCUNN/im2col.h +++ b/aten/src/THCUNN/im2col.h @@ -8,28 +8,28 @@ // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) template __launch_bounds__(CUDA_NUM_THREADS) -__global__ void im2col_kernel(const int n, const Dtype* data_im, - const int height, const int width, - const int ksize_h, const int ksize_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, +__global__ void im2col_kernel(const int64_t n, const Dtype* data_im, + const int64_t height, const int64_t width, + const int64_t ksize_h, const int64_t ksize_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, Dtype* data_col) { CUDA_KERNEL_LOOP(index, n) { - int w_out = index % width_col; + int64_t w_out = index % width_col; index /= width_col; - int h_out = index % height_col; - int channel_in = index / height_col; - int channel_out = channel_in * ksize_h * ksize_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; + int64_t h_out = index % height_col; + int64_t channel_in = index / height_col; + int64_t channel_out = channel_in * ksize_h * ksize_w; + int64_t h_in = h_out * stride_h - pad_h; + int64_t w_in = w_out * stride_w - pad_w; data_col += (channel_out * height_col + h_out) * width_col + w_out; data_im += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < ksize_h; ++i) { - for (int j = 0; j < ksize_w; ++j) { - int h = h_in + i * dilation_h; - int w = w_in + j * dilation_w; + for (int64_t i = 0; i < ksize_h; ++i) { + for (int64_t j = 0; j < ksize_w; ++j) { + int64_t h = h_in + i * dilation_h; + int64_t w = w_in + j * dilation_w; *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert::to(0); data_col += height_col * width_col; @@ -39,15 +39,15 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im, } template -void im2col(cudaStream_t stream, const Dtype* data_im, const int channels, - const int height, const int width, - const int height_col, const int width_col, - const int ksize_h, const int ksize_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, Dtype* data_col) { +void im2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t height_col, const int64_t width_col, + const int64_t ksize_h, const int64_t ksize_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_col) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. - int num_kernels = channels * height_col * width_col; + int64_t num_kernels = channels * height_col * width_col; // Launch im2col_kernel <<>> ( num_kernels, data_im, height, width, ksize_h, ksize_w, @@ -60,37 +60,37 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels, template __launch_bounds__(CUDA_NUM_THREADS) -__global__ void col2im_kernel(const int n, const Dtype* data_col, - const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, +__global__ void col2im_kernel(const int64_t n, const Dtype* data_col, + const int64_t height, const int64_t width, const int64_t channels, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, Dtype* data_im) { CUDA_KERNEL_LOOP(index, n) { Acctype val = Acctype(0); - const int w_im = index % width + pad_w; - const int h_im = (index / width) % height + pad_h; - const int c_im = index / (width * height); - int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; - int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + const int64_t w_im = index % width + pad_w; + const int64_t h_im = (index / width) % height + pad_h; + const int64_t c_im = index / (width * height); + int64_t kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int64_t kernel_extent_h = (kernel_h - 1) * dilation_h + 1; // compute the start and end of the output - const int w_col_start = + const int64_t w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; - const int w_col_end = min(w_im / stride_w + 1, width_col); - const int h_col_start = + const int64_t w_col_end = min(w_im / stride_w + 1, width_col); + const int64_t h_col_start = (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; - const int h_col_end = min(h_im / stride_h + 1, height_col); + const int64_t h_col_end = min(h_im / stride_h + 1, height_col); // TODO: use LCM of stride and dilation to avoid unnecessary loops - for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { - for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { - int h_k = (h_im - h_col * stride_h); - int w_k = (w_im - w_col * stride_w); + for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int64_t h_k = (h_im - h_col * stride_h); + int64_t w_k = (w_im - w_col * stride_w); if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { h_k /= dilation_h; w_k /= dilation_w; - int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + int64_t data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * height_col + h_col) * width_col + w_col; val += data_col[data_col_index]; } @@ -101,21 +101,21 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col, } template -void col2im(cudaStream_t stream, const Dtype* data_col, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int patch_h, const int patch_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, Dtype* data_im); +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im); template -void col2im(cudaStream_t stream, const Dtype* data_col, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int patch_h, const int patch_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, Dtype* data_im) { - int num_kernels = channels * height * width; +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im) { + int64_t num_kernels = channels * height * width; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. 
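As the comment above notes, the CUDA col2im avoids atomics by assigning one thread per image element and gathering every column entry that covers it. A plain CPU sketch of that gather formulation (dilation fixed to 1 for brevity; an illustration, not the THCUNN code):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Gather-style col2im: every image pixel sums the column entries that cover
// it, so no two iterations write the same output element and no atomics are
// needed when this is parallelized over pixels.
void col2im_gather(const std::vector<float>& data_col, int64_t channels,
                   int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w,
                   int64_t pad_h, int64_t pad_w, int64_t stride_h, int64_t stride_w,
                   std::vector<float>& data_im) {
  const int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  const int64_t width_col  = (width  + 2 * pad_w - kernel_w) / stride_w + 1;
  for (int64_t c = 0; c < channels; ++c)
    for (int64_t h = 0; h < height; ++h)
      for (int64_t w = 0; w < width; ++w) {
        const int64_t h_im = h + pad_h;  // position in the padded image
        const int64_t w_im = w + pad_w;
        // Range of column positions whose patch covers this pixel.
        const int64_t h_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
        const int64_t w_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
        const int64_t h_end = std::min(h_im / stride_h + 1, height_col);
        const int64_t w_end = std::min(w_im / stride_w + 1, width_col);
        float val = 0;
        for (int64_t h_col = h_start; h_col < h_end; ++h_col)
          for (int64_t w_col = w_start; w_col < w_end; ++w_col) {
            const int64_t h_k = h_im - h_col * stride_h;  // offset inside the patch
            const int64_t w_k = w_im - w_col * stride_w;
            const int64_t col_index =
                (((c * kernel_h + h_k) * kernel_w + w_k) * height_col + h_col) * width_col + w_col;
            val += data_col[col_index];
          }
        data_im[(c * height + h) * width + w] = val;
      }
}

int main() {
  const int64_t C = 1, H = 4, W = 4, K = 2, S = 2, P = 0;
  const int64_t hc = (H + 2 * P - K) / S + 1, wc = (W + 2 * P - K) / S + 1;
  std::vector<float> col(C * K * K * hc * wc, 1.0f);
  std::vector<float> im(C * H * W, 0.0f);
  col2im_gather(col, C, H, W, K, K, P, P, S, S, im);
  // With stride == kernel and no padding the patches tile the image exactly,
  // so every pixel accumulates exactly one 1.0 entry.
  for (float v : im) assert(v == 1.0f);
  return 0;
}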
col2im_kernel <<>> ( diff --git a/aten/src/THNN/generic/Col2Im.c b/aten/src/THNN/generic/Col2Im.c index cb95715d6f9b0d..97ed60b7274330 100644 --- a/aten/src/THNN/generic/Col2Im.c +++ b/aten/src/THNN/generic/Col2Im.c @@ -54,25 +54,25 @@ // // ALSO do vol2col -static void THNN_(im2col)(const real* data_im, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, +static void THNN_(im2col)(const real* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, real* data_col) { - const int height_col = output_height; - const int width_col = output_width; - const int channels_col = channels * kernel_h * kernel_w; - for (int c_col = 0; c_col < channels_col; ++c_col) { - int w_offset = c_col % kernel_w; - int h_offset = (c_col / kernel_w) % kernel_h; - int c_im = c_col / kernel_h / kernel_w; - for (int h_col = 0; h_col < height_col; ++h_col) { - int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (int w_col = 0; w_col < width_col; ++w_col) { - int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; data_col[(c_col * height_col + h_col) * width_col + w_col] = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
data_im[(c_im * height + h_im) * width + w_im] : 0; @@ -81,26 +81,26 @@ static void THNN_(im2col)(const real* data_im, const int channels, } } -static void THNN_(col2im)(const real* data_col, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, +static void THNN_(col2im)(const real* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, real* data_im) { memset(data_im, 0, sizeof(real) * height * width * channels); - const int height_col = output_height; - const int width_col = output_width; - const int channels_col = channels * kernel_h * kernel_w; - for (int c_col = 0; c_col < channels_col; ++c_col) { - int w_offset = c_col % kernel_w; - int h_offset = (c_col / kernel_w) % kernel_h; - int c_im = c_col / kernel_h / kernel_w; - for (int h_col = 0; h_col < height_col; ++h_col) { - int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (int w_col = 0; w_col < width_col; ++w_col) { - int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) data_im[(c_im * height + h_im) * width + w_im] += data_col[(c_col * height_col + h_col) * width_col + w_col]; @@ -113,9 +113,9 @@ static inline void THNN_(Col2Im_shapeCheck)( THNNState *state, THTensor *input, THTensor *gradOutput, - int outputHeight, int outputWidth, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 6, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -124,12 +124,12 @@ static inline void THNN_(Col2Im_shapeCheck)( THArgCheck(dW > 0 && dH > 0, 8, "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); - int ndim = THTensor_(nDimension)(input); + int64_t ndim = THTensor_(nDimension)(input); THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input, "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); - int batch_dim = (ndim == 3) ? 0 : -1; - int64_t nInputPlane = input->size[batch_dim + 1]; + int64_t batch_dim = (ndim == 3) ? 
0 : -1; + int64_t nInputPlane = input->size(batch_dim + 1); if (nInputPlane % (kW * kH) != 0) { THError("Expected size of input's dimension 1 to be divisible by the " @@ -137,7 +137,7 @@ static inline void THNN_(Col2Im_shapeCheck)( "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); } - int64_t inputLength = input->size[batch_dim + 2]; + int64_t inputLength = input->size(batch_dim + 2); int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; @@ -161,11 +161,11 @@ void THNN_(Col2Im_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, sH, sW); @@ -174,11 +174,11 @@ void THNN_(Col2Im_updateOutput)( if (input->dim() == 2) { // Force batch batched_input = false; - THTensor_(resize3d)(input, 1, input->size[0], input->size[1]); + THTensor_(resize3d)(input, 1, input->size(0), input->size(1)); } - long batchSize = input->size[0]; - long nInputPlane = input->size[1]; + long batchSize = input->size(0); + long nInputPlane = input->size(1); long nOutputPlane = nInputPlane / (kW * kH); input = THTensor_(newContiguous)(input); @@ -189,10 +189,10 @@ void THNN_(Col2Im_updateOutput)( THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); - int height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < batchSize; elt++) { THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); @@ -220,10 +220,10 @@ void THNN_(Col2Im_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, kH, kW, dH, dW, padH, padW, sH, sW); diff --git a/aten/src/THNN/generic/FeatureLPPooling.c b/aten/src/THNN/generic/FeatureLPPooling.c index fdb4bbefa78878..fec3d5d7e38a46 100644 --- a/aten/src/THNN/generic/FeatureLPPooling.c +++ b/aten/src/THNN/generic/FeatureLPPooling.c @@ -24,9 +24,9 @@ static inline size_t flpGetOffset(FeatureLPPoolingSizes* s, FEATURE_LP_SIZE_TYPE opt1, FEATURE_LP_SIZE_TYPE opt2) { return s->stride[0] * batch + - s->stride[1] * feature + - s->stride[2] * opt1 + - s->stride[3] * opt2; + s->stride[1] * feature + + s->stride[2] * opt1 + + s->stride[3] * opt2; } static inline size_t flpOutputSize(FEATURE_LP_SIZE_TYPE inputSize, diff --git a/aten/src/THNN/generic/Im2Col.c b/aten/src/THNN/generic/Im2Col.c index 8678a6ea8946f9..5ae83c5416c99c 100644 --- a/aten/src/THNN/generic/Im2Col.c +++ b/aten/src/THNN/generic/Im2Col.c @@ -6,8 +6,8 @@ static inline void THNN_(Im2Col_shapeCheck)( THNNState *state, THTensor *input, THTensor *gradOutput, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + 
int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 4, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -16,21 +16,21 @@ static inline void THNN_(Im2Col_shapeCheck)( THArgCheck(sW > 0 && sH > 0, 10, "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); - int ndim = THTensor_(nDimension)(input); + int64_t ndim = THTensor_(nDimension)(input); THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "Expected non-empty 3D or 4D input tensor, but got input of shape %s"); - int dim_batch = 0; + int64_t dim_batch = 0; if (ndim == 3) { dim_batch = -1; } - int nInputPlane = THTensor_(size)(input, dim_batch + 1); - int inputHeight = THTensor_(size)(input, dim_batch + 2); - int inputWidth = THTensor_(size)(input, dim_batch + 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - int nOutputPlane = nInputPlane * kW * kH; - int outputLength = outputHeight * outputWidth; + int64_t nInputPlane = THTensor_(size)(input, dim_batch + 1); + int64_t inputHeight = THTensor_(size)(input, dim_batch + 2); + int64_t inputWidth = THTensor_(size)(input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; if (outputHeight < 1 || outputWidth < 1) { THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " @@ -46,10 +46,10 @@ void THNN_(Im2Col_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW); @@ -57,18 +57,18 @@ void THNN_(Im2Col_updateOutput)( bool batched_input = true; if (input->dim() == 3) { batched_input = false; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } - int batchSize = THTensor_(size)(input, 0); - int nInputPlane = THTensor_(size)(input, 1); - int inputHeight = THTensor_(size)(input, 2); - int inputWidth = THTensor_(size)(input, 3); + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nInputPlane = THTensor_(size)(input, 1); + int64_t inputHeight = THTensor_(size)(input, 2); + int64_t inputWidth = THTensor_(size)(input, 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - int nOutputPlane = nInputPlane * kW * kH; - int outputLength = outputHeight * outputWidth; + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; THTensor_(resize3d)(output, batchSize, nOutputPlane, outputLength); THTensor_(zero)(output); @@ -76,7 +76,7 @@ void THNN_(Im2Col_updateOutput)( THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < 
batchSize; elt++) { THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); @@ -102,11 +102,11 @@ void THNN_(Im2Col_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, diff --git a/aten/src/THNN/generic/IndexLinear.c b/aten/src/THNN/generic/IndexLinear.c index 50aa93d788c6ec..5bc4c548fc24d0 100644 --- a/aten/src/THNN/generic/IndexLinear.c +++ b/aten/src/THNN/generic/IndexLinear.c @@ -65,7 +65,7 @@ void THNN_(IndexLinear_updateOutput)( real* outputData = THTensor_(data)(output); real* valuesData = THTensor_(data)(values); real* weightData = THTensor_(data)(weight); - int64_t weightStride0 = weight->stride[0]; + int64_t weightStride0 = weight->stride(0); real* biasData = THTensor_(data)(bias); int64_t* keysData = THLongTensor_data(keys); @@ -258,7 +258,7 @@ void THNN_(IndexLinear_updateParameters)( /* Access the storage data/strides */ real* gradWeightData = THTensor_(data)(gradWeight); real* weightData = THTensor_(data)(weight); - int64_t weightStride0 = weight->stride[0]; + int64_t weightStride0 = weight->stride(0); real* gradBiasData = THTensor_(data)(gradBias); real* biasData = THTensor_(data)(bias); int64_t* keysData = THLongTensor_data(runningKeys); @@ -406,7 +406,7 @@ void THNN_(IndexLinear_accUpdateGradParameters)( real* valuesData =THTensor_(data)(values); real* weightData = THTensor_(data)(weight); real* biasData = THTensor_(data)(bias); - int64_t weightStride0 = weight->stride[0]; + int64_t weightStride0 = weight->stride(0); int64_t* keysData = THLongTensor_data(keys); int64_t* sizesData = THLongTensor_data(sizes); diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c index 05694fc70d2336..d040d00ade3515 100644 --- a/aten/src/THNN/generic/LookupTable.c +++ b/aten/src/THNN/generic/LookupTable.c @@ -40,7 +40,7 @@ void THNN_(LookupTable_accGradParameters)( if (scaleGradByFreq) { - THIntegerTensor_(resize1d)(count, gradWeight->size[0]); + THIntegerTensor_(resize1d)(count, gradWeight->size(0)); count_data = THIntegerTensor_(data)(count); } diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 3072c03a03ea81..0699c3ac471c55 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -23,16 +23,16 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), + dim = input->size(0); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size[0] == nframe) - && (target->size[1] == dim), "inconsistent target size"); + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size(0) == nframe) + && (target->size(1) == dim), "inconsistent target size"); } THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); @@ -161,20 +161,20 @@ void 
THNN_(MultiLabelMarginCriterion_updateGradInput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; - AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size[0] == dim), + dim = input->size(0); + AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size[0] == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), "inconsistent isTarget size"); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) - && (target->size[1] == dim), 3, "inconsistent target size"); - AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size[0] == nframe) - && (isTarget->size[1] == dim), 3, "inconsistent isTarget size"); + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + && (target->size(1) == dim), 3, "inconsistent target size"); + AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size(0) == nframe) + && (isTarget->size(1) == dim), 3, "inconsistent isTarget size"); } THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 620e13c5c02b16..424669e5de8515 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -26,13 +26,13 @@ void THNN_(MultiMarginCriterion_updateOutput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size(0); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -142,13 +142,13 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size(0); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c index 462280c92a1b60..81486227115068 100644 --- a/aten/src/THNN/generic/PReLU.c +++ b/aten/src/THNN/generic/PReLU.c @@ -26,13 +26,13 @@ void THNN_(PReLU_updateOutput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(_nDimension)(input); - if (input->size[input_ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { - bs = input->size[0]; + bs = input->size(0); for (int d = 2; d < input_ndim; d++) { - ks *= input->size[d]; + ks *= input->size(d); } } } @@ -91,13 +91,13 @@ void THNN_(PReLU_updateGradInput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(_nDimension)(input); - if (input->size[input_ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { - bs = input->size[0]; + bs = input->size(0); for (int d = 2; d < input_ndim; d++) { - ks *= input->size[d]; + ks *= input->size(d); } } } @@ -162,13 +162,13 @@ void THNN_(PReLU_accGradParameters)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(_nDimension)(input); - if (input->size[input_ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { - bs = input->size[0]; + bs = input->size(0); for (int d = 2; d < input_ndim; d++) { - ks *= input->size[d]; + ks *= input->size(d); } } } diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index a0c078b4f895c9..c7a00e335dbc6b 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -6,39 +6,39 @@ #include #endif -#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) -#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) +#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride(0)) +#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride(1)) static bool THNN_(checkLegacyInput)(THTensor* t) { - return !t->is_empty() && t->dim() == 3 && t->size[2] == 2; + return !t->is_empty() && t->dim() == 3 && t->size(2) == 2; } static bool THNN_(checkInput)(THTensor* t) { - return!t->is_empty() && t->dim() == 2 && t->size[1] == 3; + return!t->is_empty() && t->dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && t->dim() == 2 && t->size[0] == size0 && t->size[1] == size1; + return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && t->dim() == 1 && t->size[0] == size0; + return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { - THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); + THStorage_(set)(t->storage, t->storageOffset + x0*t->stride(0), value); } static real THNN_(get3d)(const THTensor *t, int64_t x0, int64_t x1, int64_t x2) { return THStorage_(get)(t->storage, t->storageOffset + - x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); + x0*t->stride(0) + x1*t->stride(1) + x2*t->stride(2)); } static real THNN_(get2d)(const THTensor *t, int64_t x0, int64_t x1) { return THStorage_(get)(t->storage, t->storageOffset + - x0*t->stride[0] + x1*t->stride[1]); + x0*t->stride(0) + x1*t->stride(1)); } void THNN_(SparseLinear_updateOutput)( @@ -92,8 +92,8 @@ void THNN_(SparseLinear_updateOutput)( if 
(offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - COL_PTR2(weight, offset), weight->stride[0], - ROW_PTR2(output, h), output->stride[1]); + COL_PTR2(weight, offset), weight->stride(0), + ROW_PTR2(output, h), output->stride(1)); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); @@ -147,8 +147,8 @@ void THNN_(SparseLinear_legacyUpdateOutput)( if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - COL_PTR2(weight, offset), weight->stride[0], - ROW_PTR2(output, h), output->stride[1]); + COL_PTR2(weight, offset), weight->stride(0), + ROW_PTR2(output, h), output->stride(1)); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); @@ -221,8 +221,8 @@ void THNN_(SparseLinear_accGradParameters)( if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + ROW_PTR2(gradOutput, h), gradOutput->stride(1), + COL_PTR2(gradWeight, offset), gradWeight->stride(0)); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", @@ -289,8 +289,8 @@ void THNN_(SparseLinear_legacyAccGradParameters)( if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + ROW_PTR2(gradOutput, h), gradOutput->stride(1), + COL_PTR2(gradWeight, offset), gradWeight->stride(0)); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", @@ -324,8 +324,8 @@ void THNN_(SparseLinear_updateParameters)( { real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); int64_t i; - int64_t outDim = weight->size[0]; - int64_t inDim = weight->size[1]; + int64_t outDim = weight->size(0); + int64_t inDim = weight->size(1); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); @@ -380,8 +380,8 @@ void THNN_(SparseLinear_updateParameters)( int64_t offset = (int64_t)uniqueOffsets_p[i]; THBlas_(axpy)(outDim, -learningRate, - COL_PTR2(gradWeight, offset), gradWeight->stride[0], - COL_PTR2(weight, offset), weight->stride[0]); + COL_PTR2(gradWeight, offset), gradWeight->stride(0), + COL_PTR2(weight, offset), weight->stride(0)); } THTensor_(free)(uniqueOffsets); @@ -398,8 +398,8 @@ void THNN_(SparseLinear_legacyUpdateParameters)( { real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); int64_t h, i; - int64_t outDim = weight->size[0]; - int64_t inDim = weight->size[1]; + int64_t outDim = weight->size(0); + int64_t inDim = weight->size(1); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); @@ -456,8 +456,8 @@ void THNN_(SparseLinear_legacyUpdateParameters)( int64_t offset = (int64_t)uniqueOffsets_p[i]; THBlas_(axpy)(outDim, -learningRate, - COL_PTR2(gradWeight, offset), gradWeight->stride[0], - COL_PTR2(weight, offset), weight->stride[0]); + COL_PTR2(gradWeight, offset), gradWeight->stride(0), + COL_PTR2(weight, offset), weight->stride(0)); } THTensor_(free)(uniqueOffsets); @@ -471,8 +471,8 @@ void THNN_(SparseLinear_zeroGradParameters)( { int64_t i, j; - int64_t outDim = gradWeight->size[0]; - int64_t inDim = gradWeight->size[1]; + int64_t outDim = gradWeight->size(0); + int64_t inDim = gradWeight->size(1); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THArgCheck(THNN_(checkInput)(lastInput), 4, @@ -492,10 +492,10 @@ void THNN_(SparseLinear_zeroGradParameters)( 
int64_t offset = (int64_t)(THNN_(get2d)(lastInput, i, 1)) - 1; if (offset >= 0 && offset < inDim) { real* pGradWeight = COL_PTR2(gradWeight, offset); - if (gradWeight->stride[0] == 1) { + if (gradWeight->stride(0) == 1) { THVector_(fill)(pGradWeight, 0, outDim); } else { - int64_t stride = gradWeight->stride[0]; + int64_t stride = gradWeight->stride(0); for (j = 0; j < outDim; ++j) { pGradWeight[j * stride] = 0; } @@ -517,8 +517,8 @@ void THNN_(SparseLinear_legacyZeroGradParameters)( { int64_t h, i, j; - int64_t outDim = gradWeight->size[0]; - int64_t inDim = gradWeight->size[1]; + int64_t outDim = gradWeight->size(0); + int64_t inDim = gradWeight->size(1); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, @@ -540,10 +540,10 @@ void THNN_(SparseLinear_legacyZeroGradParameters)( int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { real* pGradWeight = COL_PTR2(gradWeight, offset); - if (gradWeight->stride[0] == 1) { + if (gradWeight->stride(0) == 1) { THVector_(fill)(pGradWeight, 0, outDim); } else { - int64_t stride = gradWeight->stride[0]; + int64_t stride = gradWeight->stride(0); for (j = 0; j < outDim; ++j) { pGradWeight[j * stride] = 0; } diff --git a/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c index c81657f7718c24..e7c47485f969d7 100644 --- a/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c +++ b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c @@ -92,21 +92,21 @@ void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( if (input->dim() == 4) { - istrideB = input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimD++; dimH++; dimW++; } /* sizes */ - sizeD = input->size[dimD]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimD); + isizeH = input->size(dimH); + isizeW = input->size(dimW); /* strides */ - istrideD = input->stride[dimD]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimD); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 3) @@ -218,18 +218,18 @@ void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - sizeB = input->size[0]; + sizeB = input->size(0); dimD++; dimH++; dimW++; } /* sizes */ - sizeD = input->size[dimD]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimD); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c index 711fa73b46555a..fc49a2388cfb6f 100644 --- a/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c +++ b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c @@ -102,20 +102,20 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( if (input->dim() == 4) { - istrideB = input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimW++; dimH++; } /* sizes */ - sizeD = input->size[dimH-1]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimH-1); + isizeH = input->size(dimH); + isizeW = 
input->size(dimW); /* strides */ - istrideD = input->stride[dimH-1]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimH-1); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 3) @@ -223,17 +223,17 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - sizeB = input->size[0]; + sizeB = input->size(0); dimW++; dimH++; } /* sizes */ - sizeD = input->size[dimH-1]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimH-1); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/SpatialAveragePooling.c b/aten/src/THNN/generic/SpatialAveragePooling.c index 2a057e43d294b9..bdf851cf5f2a19 100644 --- a/aten/src/THNN/generic/SpatialAveragePooling.c +++ b/aten/src/THNN/generic/SpatialAveragePooling.c @@ -31,9 +31,9 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight, outputWidth; int64_t nOutputPlane = nInputPlane; @@ -103,15 +103,15 @@ void THNN_(SpatialAveragePooling_updateOutput)( (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimc++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; - nInputPlane = input->size[dimc]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); + nInputPlane = input->size(dimc); if(ceil_mode) { @@ -136,7 +136,7 @@ void THNN_(SpatialAveragePooling_updateOutput)( if (input->dim() == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); else - THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + THTensor_(resize4d)(output, input->size(0), nInputPlane, outputHeight, outputWidth); input = THTensor_(newContiguous)(input); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); @@ -232,16 +232,16 @@ void THNN_(SpatialAveragePooling_updateGradInput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimc++; ndim = 4; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; - nInputPlane = input->size[dimc]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); + nInputPlane = input->size(dimc); if(ceil_mode) { diff --git a/aten/src/THNN/generic/SpatialConvolutionLocal.c b/aten/src/THNN/generic/SpatialConvolutionLocal.c index 443901a37e70c1..6461285ca931fb 100644 --- a/aten/src/THNN/generic/SpatialConvolutionLocal.c +++ b/aten/src/THNN/generic/SpatialConvolutionLocal.c @@ -29,8 +29,8 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t nInputPlane = weight->size[2] / (kH * kW); - int64_t nOutputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(2) / (kH * kW); 
+ int64_t nOutputPlane = weight->size(1); if (bias != NULL) { THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane); @@ -53,9 +53,9 @@ static THTensor* THNN_(view_weight_local)(THTensor *_weight) AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); if (weight->dim() == 6) { - int64_t s1 = weight->size[0] * weight->size[1]; - int64_t s2 = weight->size[2]; - int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + int64_t s1 = weight->size(0) * weight->size(1); + int64_t s2 = weight->size(2); + int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THTensor *old_weight = weight; weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset, @@ -140,7 +140,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); @@ -243,7 +243,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #pragma omp parallel for private(t) @@ -339,7 +339,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; for(t = 0; t < T; t++) diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index cdbff690b248af..434320a3cfdfd7 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -16,7 +16,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -36,8 +36,8 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t exactInputHeight = inputHeight + 2 * padH; int64_t exactInputWidth = inputWidth + 2 * padW; @@ -58,7 +58,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); if (weight->dim() == 2) { nInputPlane /= (kH * kW); } @@ -67,10 +67,10 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -81,8 +81,8 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) { weight = THTensor_(newContiguous)(weight); if (weight->dim() == 4) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * 
weight->size[2] * weight->size[3]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); THTensor *old_weight = weight; weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1); @@ -123,7 +123,7 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)( if (bias) { for(i = 0; i < nOutputPlane; i++) THVector_(fill) - (THStorage_(data)(output->storage) + output->storageOffset + output->stride[0] * i, + (THStorage_(data)(output->storage) + output->storageOffset + output->stride(0) * i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } else { THTensor_(zero)(output); @@ -166,10 +166,10 @@ void THNN_(SpatialConvolutionMM_updateOutput)( dimw++; } - int64_t nInputPlane = input->size[dimf]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; - int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = input->size(dimf); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); + int64_t nOutputPlane = weight->size(0); int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; @@ -186,7 +186,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); @@ -229,8 +229,8 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( { THTensor *gradOutput2d = THTensor_(newWithStorage2d) (gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2], -1); + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2), -1); THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); THTensor_(free)(gradOutput2d); @@ -238,8 +238,8 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, - gradInput->size[0], gradInput->size[2], gradInput->size[1], - gradOutput->size[2], gradOutput->size[1]); + gradInput->size(0), gradInput->size(2), gradInput->size(1), + gradOutput->size(2), gradOutput->size(1)); } void THNN_(SpatialConvolutionMM_updateGradInput)( @@ -283,7 +283,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #pragma omp parallel for private(t) @@ -319,8 +319,8 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d) (gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2], -1); + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2), -1); if (gradWeight) { THTensor *tfinput = THTensor_(new)(); @@ -330,12 +330,12 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for(i = 0; i < gradBias->size[0]; i++) + for(i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; - for(k = 0; k < gradOutput2d->size[1]; k++) + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + for(k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale*sum; } @@ -382,7 +382,7 @@ void 
THNN_(SpatialConvolutionMM_accGradParameters)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; for(t = 0; t < T; t++) diff --git a/aten/src/THNN/generic/SpatialConvolutionMap.c b/aten/src/THNN/generic/SpatialConvolutionMap.c index cdd74ed304f9d6..f91c372b4a7ac8 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMap.c +++ b/aten/src/THNN/generic/SpatialConvolutionMap.c @@ -9,7 +9,7 @@ void THNN_(SpatialConvolutionMap_updateOutput)( { THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 4, + && connTable != NULL && connTable->size(0) == weight->size(0), 4, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -22,27 +22,27 @@ void THNN_(SpatialConvolutionMap_updateOutput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimc++; dimw++; dimh++; } - const int64_t kH = weight->size[1]; - const int64_t kW = weight->size[2]; + const int64_t kH = weight->size(1); + const int64_t kW = weight->size(2); - THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size"); + THArgCheck(input->size(dimc) >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH, 2, "input image smaller than kernel size"); - const int64_t input_w = input->size[dimw]; - const int64_t input_h = input->size[dimh]; + const int64_t input_w = input->size(dimw); + const int64_t input_h = input->size(dimh); const int64_t output_w = (input_w - kW) / dW + 1; const int64_t output_h = (input_h - kH) / dH + 1; if (input->dim() == 3) THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); else - THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w); + THTensor_(resize4d)(output, input->size(0), nOutputPlane, output_h, output_w); /* contiguous */ input = THTensor_(newContiguous)(input); @@ -73,7 +73,7 @@ void THNN_(SpatialConvolutionMap_updateOutput)( ptr_output[j] = z; /* convolve all maps */ - int nweight = connTable->size[0]; + int nweight = connTable->size(0); for (k = 0; k < nweight; k++) { /* get offsets for input/output */ @@ -110,7 +110,7 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( { THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 5, + && connTable != NULL && connTable->size(0) == weight->size(0), 5, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -120,17 +120,17 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( int64_t nbatch = 1; if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - const int64_t input_h = input->size[dimh]; - const int64_t input_w = input->size[dimw]; - const int64_t output_h = gradOutput->size[dimh]; - const int64_t output_w = gradOutput->size[dimw]; - const int64_t kH = weight->size[1]; - const int64_t kW = weight->size[2]; + const int64_t input_h = input->size(dimh); + const int64_t input_w = input->size(dimw); + const int64_t output_h = gradOutput->size(dimh); + const int64_t output_w = gradOutput->size(dimw); + const int64_t kH = weight->size(1); + const int64_t kW = weight->size(2); /* contiguous */ gradInput = THTensor_(newContiguous)(gradInput); @@ -157,7 +157,7 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( { 
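
For orientation, SpatialConvolutionMap applies one kH x kW kernel per row of the connection table: `connTable_data[k*2]` names the input plane feeding kernel k and `connTable_data[k*2+1]` the output plane that accumulates its result, which is why the weight is checked to be connTable:size(0) x kH x kW. A schematic, plain-loop version under illustrative layouts (not the THNN implementation):

```cpp
#include <array>
#include <cstdint>
#include <vector>

struct Plane { int64_t h = 0, w = 0; std::vector<float> v; };  // row-major h x w

// Valid cross-correlation with stride (dH, dW), one kernel per table entry.
// Assumes each output plane is pre-sized to (inH - kH)/dH + 1 by (inW - kW)/dW + 1,
// matching the output resize shown above, and zero-initialized.
void conv_map(std::vector<Plane>& out, const std::vector<Plane>& in,
              const std::vector<std::array<int, 2>>& conn,    // {inPlane, outPlane}
              const std::vector<std::vector<float>>& kernel,  // kernel[k]: kH*kW values
              int64_t kH, int64_t kW, int64_t dH, int64_t dW) {
  for (size_t k = 0; k < conn.size(); ++k) {
    const Plane& src = in[conn[k][0]];
    Plane& dst = out[conn[k][1]];
    for (int64_t y = 0; y < dst.h; ++y)
      for (int64_t x = 0; x < dst.w; ++x) {
        float acc = 0.f;
        for (int64_t i = 0; i < kH; ++i)
          for (int64_t j = 0; j < kW; ++j)
            acc += src.v[(y * dH + i) * src.w + (x * dW + j)] * kernel[k][i * kW + j];
        dst.v[y * dst.w + x] += acc;  // several connections may feed one output plane
      }
  }
}
```
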
int64_t k; /* backward all */ - int nkernel = connTable->size[0]; + int nkernel = connTable->size(0); for (k = 0; k < nkernel; k++) { int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; @@ -197,7 +197,7 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); THArgCheck( gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 - && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + && connTable != NULL && connTable->size(0) == gradWeight->size(0), 5, "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -207,17 +207,17 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( int64_t nbatch = 1; if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - const int64_t input_h = input->size[dimh]; - const int64_t input_w = input->size[dimw]; - const int64_t output_h = gradOutput->size[dimh]; - const int64_t output_w = gradOutput->size[dimw]; - const int64_t kH = gradWeight->size[1]; - const int64_t kW = gradWeight->size[2]; + const int64_t input_h = input->size(dimh); + const int64_t input_w = input->size(dimw); + const int64_t output_h = gradOutput->size(dimh); + const int64_t output_w = gradOutput->size(dimw); + const int64_t kH = gradWeight->size(1); + const int64_t kW = gradWeight->size(2); /* contiguous */ input = THTensor_(newContiguous)(input); @@ -248,7 +248,7 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( } /* gradients wrt weight */ - const int nkernel = connTable->size[0]; + const int nkernel = connTable->size(0); #pragma omp parallel for private(k) for (k = 0; k < nkernel; k++) { diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c index bddf79be0b785b..10b792a1d61146 100644 --- a/aten/src/THNN/generic/SpatialDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ -20,7 +20,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( "non-empty 4D weight tensor (nOutputPlane, nInputPlane, kH, kW) expected, " "but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -40,8 +40,8 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -53,16 +53,16 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -89,8 
+89,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( dilationH, dilationW, 0); // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -103,15 +103,15 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -124,7 +124,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (!THTensor_(isContiguous)(ones) || ones->dim() != 2 || - ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -173,7 +173,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -220,8 +220,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( dilationH, dilationW, 0); // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(1); + int64_t nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -231,18 +231,18 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], - gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), + gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -263,7 +263,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t 
k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -338,20 +338,20 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], - gradOutput->size[1], gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), + gradOutput->size(1), gradOutput->size(2)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); @@ -383,7 +383,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -405,7 +405,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
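
The `m`, `n`, `k` assignments in these hunks spell out the im2col formulation: `columns` is resized to (nInputPlane*kW*kH) x (outputHeight*outputWidth), the 4D weight (nOutputPlane, nInputPlane, kH, kW) is viewed as an nOutputPlane x (nInputPlane*kH*kW) matrix, and the forward convolution collapses into one matrix product. A naive sketch of that product with the same dimension names (plain loops standing in for THBlas_(gemm); row-major buffers assumed):

```cpp
#include <cstdint>
#include <vector>

// output[m x n] += weight[m x k] * columns[k x n], all row-major.
// Forward pass: m = nOutputPlane, k = nInputPlane*kH*kW, n = outputHeight*outputWidth.
void gemm_naive(std::vector<float>& output, const std::vector<float>& weight,
                const std::vector<float>& columns, int64_t m, int64_t n, int64_t k) {
  for (int64_t i = 0; i < m; ++i)
    for (int64_t j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int64_t p = 0; p < k; ++p)
        acc += weight[i * k + p] * columns[p * n + j];
      output[i * n + j] += acc;  // accumulate so the bias pass can run beforehand
    }
}
```

The backward pass reuses the same machinery with the roles swapped, which is why updateGradInput sets m = nInputPlane*kW*kH and k = nOutputPlane just above.
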
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); diff --git a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c index 2d595b7c449dc5..2b77fcee76d028 100644 --- a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c +++ b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c @@ -34,9 +34,9 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight, outputWidth; int64_t nOutputPlane = nInputPlane; @@ -184,15 +184,15 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nInputPlane = input->size[dimh-1]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; + nInputPlane = input->size(dimh-1); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); if (ceil_mode) { outputHeight = (int64_t)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; @@ -349,17 +349,17 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nInputPlane = input->size[dimh-1]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; - outputHeight = gradOutput->size[dimh]; - outputWidth = gradOutput->size[dimw]; + nInputPlane = input->size(dimh-1); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); + outputHeight = gradOutput->size(dimh); + outputWidth = gradOutput->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/SpatialFullConvolutionMap.c b/aten/src/THNN/generic/SpatialFullConvolutionMap.c index a6fe50725df6b1..a989ba207c17da 100644 --- a/aten/src/THNN/generic/SpatialFullConvolutionMap.c +++ b/aten/src/THNN/generic/SpatialFullConvolutionMap.c @@ -12,20 +12,20 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( // What does this mean? 
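
The same output-size arithmetic recurs throughout these files, and the "Full" (transposed) convolutions that follow invert it. A small sketch collecting the formulas as they appear in the hunks (floor division for the forward case; the pooling code computes the ceil_mode variant through a float `ceil`, written here in integer form):

```cpp
#include <cstdint>

// Forward conv/pool: how many kernel placements fit after padding.
// dilation = 1 recovers the plain (in + 2*pad - k) / d + 1 form.
int64_t out_size_forward(int64_t in, int64_t k, int64_t d, int64_t pad,
                         int64_t dilation, bool ceil_mode) {
  const int64_t eff_k = dilation * (k - 1) + 1;  // effective kernel extent
  const int64_t span = in + 2 * pad - eff_k;     // room left for stride steps
  return (ceil_mode ? (span + d - 1) / d : span / d) + 1;
}

// "Full" (transposed) convolution: invert the forward mapping; `adj` is the
// extra output adjustment that resolves the ambiguity left by floor division.
int64_t out_size_full(int64_t in, int64_t k, int64_t d, int64_t pad,
                      int64_t dilation, int64_t adj) {
  return (in - 1) * d - 2 * pad + (dilation * (k - 1) + 1) + adj;
}
```

For example, in = 7, k = 3, d = 2, pad = 0, dilation = 1 gives 3 forward outputs, and out_size_full maps 3 back to 7 with adj = 0; SpatialFullConvolutionMap below is the pad = 0, dilation = 1, adj = 0 special case, (in - 1)*d + k.
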
THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 4, + && connTable != NULL && connTable->size(0) == weight->size(0), 4, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); - const int kH = (int)weight->size[1]; - const int kW = (int)weight->size[2]; + const int kH = (int)weight->size(1); + const int kW = (int)weight->size(2); THArgCheck(input != NULL && !input->is_empty() && input->dim() == 3, 2, "non-empty 3D tensor expected"); - THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size(0) >= nInputPlane, 2, "invalid number of input planes"); THTensor_(resize3d)( output_, nOutputPlane, - (input->size[1] - 1) * dH + kH, - (input->size[2] - 1) * dW + kW + (input->size(1) - 1) * dH + kH, + (input->size(2) - 1) * dW + kW ); /* contiguous */ @@ -40,12 +40,12 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( real *connTable_data = THTensor_(data)(connTable); /* and dims */ - const int64_t input_h = input->size[1]; - const int64_t input_w = input->size[2]; - const int64_t output_h = output->size[1]; - const int64_t output_w = output->size[2]; - const int64_t weight_h = weight->size[1]; - const int64_t weight_w = weight->size[2]; + const int64_t input_h = input->size(1); + const int64_t input_w = input->size(2); + const int64_t output_h = output->size(1); + const int64_t output_w = output->size(2); + const int64_t weight_h = weight->size(1); + const int64_t weight_w = weight->size(2); int64_t p; #pragma omp parallel for private(p) @@ -61,7 +61,7 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( ptr_output[j] = bias_data[p]; /* convolve all maps */ - nweight = connTable->size[0]; + nweight = connTable->size(0); for (k = 0; k < nweight; k++) { /* get offsets for input/output */ @@ -93,7 +93,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( { THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 5, + && connTable != NULL && connTable->size(0) == weight->size(0), 5, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -112,12 +112,12 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( real *connTable_data = THTensor_(data)(connTable); /* and dims */ - const int64_t input_h = input->size[1]; - const int64_t input_w = input->size[2]; - const int64_t output_h = gradOutput->size[1]; - const int64_t output_w = gradOutput->size[2]; - const int64_t kH = weight->size[1]; - const int64_t kW = weight->size[2]; + const int64_t input_h = input->size(1); + const int64_t input_w = input->size(2); + const int64_t output_h = gradOutput->size(1); + const int64_t output_w = gradOutput->size(2); + const int64_t kH = weight->size(1); + const int64_t kW = weight->size(2); int64_t p; #pragma omp parallel for private(p) @@ -125,7 +125,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( { int64_t k; /* backward all */ - int nkernel = connTable->size[0]; + int nkernel = connTable->size(0); for (k = 0; k < nkernel; k++) { int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; @@ -164,7 +164,7 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); THArgCheck( gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 - && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + && connTable != NULL && connTable->size(0) == 
gradWeight->size(0), 5, "non-empty 3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -179,12 +179,12 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( real *gradBias_data = THTensor_(data)(gradBias); /* and dims */ - const int64_t input_h = input->size[1]; - const int64_t input_w = input->size[2]; - const int64_t output_h = gradOutput->size[1]; - const int64_t output_w = gradOutput->size[2]; - const int64_t weight_h = gradWeight->size[1]; - const int64_t weight_w = gradWeight->size[2]; + const int64_t input_h = input->size(1); + const int64_t input_w = input->size(2); + const int64_t output_h = gradOutput->size(1); + const int64_t output_w = gradOutput->size(2); + const int64_t weight_h = gradWeight->size(1); + const int64_t weight_w = gradWeight->size(2); /* gradients wrt bias */ int64_t k; @@ -198,7 +198,7 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( } /* gradients wrt weight */ - int nkernel = connTable->size[0]; + int nkernel = connTable->size(0); #pragma omp parallel for private(k) for (k = 0; k < nkernel; k++) { diff --git a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c index 35098095d6a7ef..7226db67ef1a74 100644 --- a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -23,7 +23,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(1)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -43,8 +43,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; @@ -55,16 +55,16 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -105,16 +105,16 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputHeight = input->size[2]; - int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size(2); + int64_t inputWidth = input->size(3); int64_t outputHeight = (inputHeight 
- 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -126,7 +126,7 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -145,9 +145,9 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; - int64_t n = columns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1) * weight->size(2) * weight->size(3); + int64_t n = columns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -221,8 +221,8 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); - int nInputPlane = THTensor_(size)(weight,0); - int nOutputPlane = THTensor_(size)(weight,1); + int64_t nInputPlane = THTensor_(size)(weight,0); + int64_t nOutputPlane = THTensor_(size)(weight,1); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -233,17 +233,17 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -275,9 +275,9 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t m = weight->size(0); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(1) * weight->size(2) * weight->size(3); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -328,7 +328,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( 
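
This function keeps the same reusable `ones` buffer as updateOutput: broadcasting a per-plane bias over every output position is the rank-1 update output += bias * onesᵀ, and accumulating gradBias is the product gradOutput * ones (a row sum), so both become ordinary THBlas_(gemm)/THBlas_(gemv) calls, and the buffer is only reallocated when `ones->size(0)*ones->size(1)` falls below outputHeight*outputWidth. A plain-loop sketch of the two uses (illustrative layouts, not the THNN code):

```cpp
#include <cstdint>
#include <vector>

// output, gradOutput: [planes x hw] row-major; bias, gradBias: [planes].
void add_bias(std::vector<float>& output, const std::vector<float>& bias,
              int64_t planes, int64_t hw) {
  const std::vector<float> ones(hw, 1.f);
  for (int64_t i = 0; i < planes; ++i)
    for (int64_t j = 0; j < hw; ++j)
      output[i * hw + j] += bias[i] * ones[j];      // rank-1 update bias * ones^T
}

void acc_grad_bias(std::vector<float>& gradBias, const std::vector<float>& gradOutput,
                   int64_t planes, int64_t hw, float scale) {
  const std::vector<float> ones(hw, 1.f);
  for (int64_t i = 0; i < planes; ++i) {
    float sum = 0.f;
    for (int64_t j = 0; j < hw; ++j)
      sum += gradOutput[i * hw + j] * ones[j];      // row sum as a dot with ones
    gradBias[i] += scale * sum;
  }
}
```
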
(input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 1); - int nOutputPlane; + int64_t nOutputPlane; if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { @@ -352,20 +352,20 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -401,9 +401,9 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t n = columns->size[0]; // nOutputPlane * kh * kw - int64_t m = input_n->size[0]; // nInputPlane - int64_t k = columns->size[1]; // inputHeight * inputWidth + int64_t n = columns->size(0); // nOutputPlane * kh * kw + int64_t m = input_n->size(0); // nInputPlane + int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -444,7 +444,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // Resize if (is_batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); - THTensor_(resize3d)(input, input->size[1], inputHeight, inputWidth); + THTensor_(resize3d)(input, input->size(1), inputHeight, inputWidth); } THTensor_(free)(input); diff --git a/aten/src/THNN/generic/SpatialMaxUnpooling.c b/aten/src/THNN/generic/SpatialMaxUnpooling.c index 64179b52779a01..cbfb34ed924b57 100644 --- a/aten/src/THNN/generic/SpatialMaxUnpooling.c +++ b/aten/src/THNN/generic/SpatialMaxUnpooling.c @@ -67,15 +67,15 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nslices = input->size[dimh-1]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimh-1); + iheight = input->size(dimh); + iwidth = input->size(dimw); /* get contiguous input and indices */ input = THTensor_(newContiguous)(input); @@ -184,19 +184,19 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nslices = input->size[dimh-1]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimh-1); + iheight = input->size(dimh); + 
iwidth = input->size(dimw); - if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){ THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", - oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]); + oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw)); } /* get raw pointers */ diff --git a/aten/src/THNN/generic/SpatialReflectionPadding.c b/aten/src/THNN/generic/SpatialReflectionPadding.c index 4ccdca8abde381..dec9ffd3f94ae6 100644 --- a/aten/src/THNN/generic/SpatialReflectionPadding.c +++ b/aten/src/THNN/generic/SpatialReflectionPadding.c @@ -72,26 +72,24 @@ void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* input sizes */ - nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); - THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, - "Padding size should be less than the corresponding input dimension, " - "but got: padding (%d, %d) at dimension %d of input %s", - pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + AT_CHECK(pad_l < iwidth && pad_r < iwidth, + "Argument #4: Padding size should be less than the corresponding input dimension, " + "but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dimw, " of input ", input->sizes()); - THArgCheck(pad_t < iheight && pad_b < iheight, 6, - "Padding size should be less than the corresponding input dimension, " - "but got: padding (%d, %d) at dimension %d of input %s", - pad_t, pad_b, dimh, _THSizeDesc(input->size, input->dim()).str); + AT_CHECK(pad_t < iheight && pad_b < iheight, + "Argument #6: Padding size should be less than the corresponding input dimension, " + "but got: padding (", pad_t, ", ", pad_b, ") at dimension ", dimh, " of input ", input->sizes()); /* output sizes */ oheight = iheight + pad_t + pad_b; @@ -213,16 +211,16 @@ void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); oheight = iheight + pad_t + pad_b; owidth = iwidth + pad_l + pad_r; diff --git a/aten/src/THNN/generic/SpatialReplicationPadding.c b/aten/src/THNN/generic/SpatialReplicationPadding.c index 32c125d87c4c49..9275768001e1b6 100644 --- a/aten/src/THNN/generic/SpatialReplicationPadding.c +++ b/aten/src/THNN/generic/SpatialReplicationPadding.c @@ -71,16 +71,16 @@ void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); oheight = iheight + pad_t + pad_b; owidth = iwidth + pad_l + pad_r; @@ -200,16 +200,16 @@ void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* sizes */ - 
nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); oheight = iheight + pad_t + pad_b; owidth = iwidth + pad_l + pad_r; diff --git a/aten/src/THNN/generic/SpatialSubSampling.c b/aten/src/THNN/generic/SpatialSubSampling.c index 8f9f95d48252e8..10303e951c615f 100644 --- a/aten/src/THNN/generic/SpatialSubSampling.c +++ b/aten/src/THNN/generic/SpatialSubSampling.c @@ -24,10 +24,10 @@ static inline void THNN_(SpatialSubSampling_shapeCheck)( dimh++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); - THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size(dimh-1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); } @@ -63,20 +63,20 @@ void THNN_(SpatialSubSampling_updateOutput)( THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; if (input->dim() == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); else - THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + THTensor_(resize4d)(output, input->size(0), nInputPlane, outputHeight, outputWidth); input = THTensor_(newContiguous)(input); input_data = THTensor_(data)(input); @@ -152,13 +152,13 @@ void THNN_(SpatialSubSampling_updateGradInput)( int64_t k; if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; @@ -239,11 +239,11 @@ void THNN_(SpatialSubSampling_accGradParameters)( if (input->dim() == 4) { dimw++; dimh++; - nbatch = input->size[0]; + nbatch = input->size(0); } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 98338d64da9748..455da04c7e4454 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -147,39 +147,39 @@ TH_API void THNN_(Im2Col_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(Im2Col_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(Col2Im_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int 
outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(Col2Im_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(L1Cost_updateOutput)( THNNState *state, // library's state diff --git a/aten/src/THNN/generic/TemporalConvolution.c b/aten/src/THNN/generic/TemporalConvolution.c index a7fdd3f96444eb..2c3e1da84de5ea 100644 --- a/aten/src/THNN/generic/TemporalConvolution.c +++ b/aten/src/THNN/generic/TemporalConvolution.c @@ -25,13 +25,13 @@ static inline void THNN_(TemporalConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); if (inputFrameSize != NULL) { - THArgCheck(input->size[dimF] == *inputFrameSize, 2, + THArgCheck(input->size(dimF) == *inputFrameSize, 2, "invalid input frame size. Got: %d, Expected: %d", - input->size[dimF], *inputFrameSize); + input->size(dimF), *inputFrameSize); } - THArgCheck(input->size[dimS] >= kW, 2, + THArgCheck(input->size(dimS) >= kW, 2, "input sequence smaller than kernel size. Got: %d, Expected: %d", - input->size[dimS], kW); + input->size(dimS), kW); } void THNN_(TemporalConvolution_updateOutput)( @@ -64,7 +64,7 @@ void THNN_(TemporalConvolution_updateOutput)( outputWindow = THTensor_(new)(); inputWindow = THTensor_(new)(); - nInputFrame = input->size[dimS]; + nInputFrame = input->size(dimS); nOutputFrame = (nInputFrame - kW) / dW + 1; if (input->dim() == 2) @@ -89,14 +89,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THTensor_(setStorage2d)(outputWindow, output->storage, - output->storageOffset + k*output->size[1], - nFrame, outputFrameStride*output->size[1], - output->size[1], 1); + output->storageOffset + k*output->size(1), + nFrame, outputFrameStride*output->size(1), + output->size(1), 1); THTensor *tweight = THTensor_(new)(); THTensor_(transpose)(tweight, weight, 0, 1); @@ -108,7 +108,7 @@ void THNN_(TemporalConvolution_updateOutput)( { THTensor *outputSample = THTensor_(new)(); THTensor *inputSample = THTensor_(new)(); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); THTensor_(resize3d)(output, nBatchFrame, @@ -137,14 +137,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputSampleFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THTensor_(setStorage2d)(outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size[1], - nFrame, outputFrameStride*outputSample->size[1], - outputSample->size[1], 1); + outputSample->storageOffset 
+ k*outputSample->size(1), + nFrame, outputFrameStride*outputSample->size(1), + outputSample->size(1), 1); THTensor *tweight = THTensor_(new)(); THTensor_(transpose)(tweight, weight, 0, 1); @@ -188,8 +188,8 @@ void THNN_(TemporalConvolution_updateGradInput)( THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); THNN_(TemporalConvolution_shapeCheck)( state, input, kW, dW, NULL); - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -211,14 +211,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputFrame -= nFrame; THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset + k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size[1], - nFrame, inputFrameStride*gradInput->size[1], - kW*gradInput->size[1], 1); + gradInput->storageOffset+k*dW*gradInput->size(1), + nFrame, inputFrameStride*gradInput->size(1), + kW*gradInput->size(1), 1); THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); } @@ -227,7 +227,7 @@ void THNN_(TemporalConvolution_updateGradInput)( { THTensor *gradOutputSample = THTensor_(new)(); THTensor *gradInputSample = THTensor_(new)(); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { @@ -244,14 +244,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputSampleFrame -= nFrame; THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size[1], - nFrame, inputFrameStride*gradInputSample->size[1], - kW*gradInputSample->size[1], 1); + gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + nFrame, inputFrameStride*gradInputSample->size(1), + kW*gradInputSample->size(1), 1); THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); } @@ -294,8 +294,8 @@ void THNN_(TemporalConvolution_accGradParameters)( THNN_(TemporalConvolution_shapeCheck)( state, input, kW, dW, NULL); - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -320,14 +320,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, 
outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset + k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THTensor *tgradOutputWindow = THTensor_(new)(); THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); @@ -339,7 +339,7 @@ void THNN_(TemporalConvolution_accGradParameters)( { THTensor *gradOutputSample = THTensor_(new)(); THTensor *inputSample = THTensor_(new)(); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { @@ -363,14 +363,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputSampleFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THTensor *tgradOutputWindow = THTensor_(new)(); THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); diff --git a/aten/src/THNN/generic/TemporalMaxPooling.c b/aten/src/THNN/generic/TemporalMaxPooling.c index faef3059d09619..69f1c94eced211 100644 --- a/aten/src/THNN/generic/TemporalMaxPooling.c +++ b/aten/src/THNN/generic/TemporalMaxPooling.c @@ -23,8 +23,8 @@ static inline void THNN_(TemporalMaxPooling_shapeCheck)( dimF = 2; } - niframe = input->size[dimS]; - framesize = input->size[dimF]; + niframe = input->size(dimS); + framesize = input->size(dimF); noframe = (niframe - kW) / dW + 1; THArgCheck(kW > 0, 5, @@ -34,9 +34,9 @@ static inline void THNN_(TemporalMaxPooling_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); - THArgCheck(input->size[dimS] >= kW, 2, + THArgCheck(input->size(dimS) >= kW, 2, "input sequence smaller than kernel size. 
Got: %d, Expected: %d", - input->size[dimS], kW); + input->size(dimS), kW); if (gradOutput != NULL) { THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe); @@ -78,8 +78,8 @@ void THNN_(TemporalMaxPooling_updateOutput)( } /* sizes */ - niframe = input->size[dimS]; - framesize = input->size[dimF]; + niframe = input->size(dimS); + framesize = input->size(dimF); noframe = (niframe - kW) / dW + 1; /* get contiguous input */ @@ -129,7 +129,7 @@ void THNN_(TemporalMaxPooling_updateOutput)( else { /* number of batch frames */ - int64_t nbframe = input->size[0]; + int64_t nbframe = input->size(0); int64_t i; /* resize output */ @@ -221,9 +221,9 @@ void THNN_(TemporalMaxPooling_updateGradInput)( dimF = 2; } /* sizes */ - niframe = input->size[dimS]; - noframe = gradOutput->size[dimS]; - framesize = gradOutput->size[dimF]; + niframe = input->size(dimS); + noframe = gradOutput->size(dimS); + framesize = gradOutput->size(dimF); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); @@ -250,7 +250,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)( else { /* number of batch frames */ - int64_t nbframe = input->size[0]; + int64_t nbframe = input->size(0); int64_t i; for(i = 0; i < nbframe; i++) diff --git a/aten/src/THNN/generic/TemporalReflectionPadding.c b/aten/src/THNN/generic/TemporalReflectionPadding.c index ea6ea9ab60ebb5..43eb604a78972c 100644 --- a/aten/src/THNN/generic/TemporalReflectionPadding.c +++ b/aten/src/THNN/generic/TemporalReflectionPadding.c @@ -55,19 +55,18 @@ void THNN_(TemporalReflectionPadding_updateOutput)(THNNState *state, if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* input size */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); - THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, - "Padding size should be less than the corresponding input dimension, " - "but got: padding (%d, %d) at dimension %d of input %s", - pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + AT_CHECK(pad_l < iwidth && pad_r < iwidth, + "Argument #4: Padding size should be less than the corresponding input dimension, " + "but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dimw, " of input ", input->sizes()); /* output size */ owidth = iwidth + pad_l + pad_r; @@ -168,14 +167,14 @@ void THNN_(TemporalReflectionPadding_updateGradInput)(THNNState *state, if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); owidth = iwidth + pad_l + pad_r; THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, diff --git a/aten/src/THNN/generic/TemporalReplicationPadding.c b/aten/src/THNN/generic/TemporalReplicationPadding.c index da8aeb5d8e52a0..e47a94144530c3 100644 --- a/aten/src/THNN/generic/TemporalReplicationPadding.c +++ b/aten/src/THNN/generic/TemporalReplicationPadding.c @@ -53,14 +53,14 @@ void THNN_(TemporalReplicationPadding_updateOutput)(THNNState *state, if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); owidth = iwidth + pad_l + pad_r; THArgCheck(owidth >= 1 , 2, @@ -159,14 +159,14 @@ void THNN_(TemporalReplicationPadding_updateGradInput)(THNNState *state, 
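
A second pattern in the reflection-padding files above is the move from printf-style THArgCheck messages to AT_CHECK, whose message is assembled from a comma-separated list of heterogeneous pieces (string literals, the pad values, `input->sizes()`). A minimal stand-in that shows only the calling convention — not the real AT_CHECK implementation — assuming C++17 for the fold expression:

```cpp
#include <sstream>
#include <stdexcept>

// Streams every extra argument into one message when the condition fails.
template <typename... Args>
void check(bool cond, Args&&... pieces) {
  if (cond) return;
  std::ostringstream msg;
  (msg << ... << pieces);  // C++17 left fold over operator<<
  throw std::runtime_error(msg.str());
}

// Usage mirroring the hunk above (pad_l, pad_r, iwidth, dimw assumed in scope):
//   check(pad_l < iwidth && pad_r < iwidth,
//         "Padding size should be less than the corresponding input dimension, "
//         "but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dimw);
```
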
if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); owidth = iwidth + pad_l + pad_r; THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index db3278b3c2e1b3..5a85065058670c 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -22,7 +22,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } // we're always looking at (possibly batch) x feats x seq @@ -38,8 +38,8 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[dimS]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; if (nOutputFrame < 1) { @@ -162,7 +162,7 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)( for (i = 0; i < inputFrameSize; i++) THVector_(fill) (THStorage_(data)(output->storage) + output->storageOffset - + output->stride[0] * i, + + output->stride(0) * i, THTensor_(get1d)(bias, i), nOutputFrame); } @@ -186,7 +186,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( int ndim = input->dim(); - THTensor *tinput; + THTensor *tinput = NULL; if (!featFirst) { tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); input = THTensor_(newContiguous)(tinput); @@ -197,8 +197,8 @@ void THNN_(TemporalRowConvolution_updateOutput)( THNN_(TemporalRowConvolution_shapeCheck)( state, input, NULL, weight, bias, kW, dW, padW); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[ndim - 1]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; if (ndim == 2) { /* non-batch mode */ @@ -215,7 +215,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( inputFrameSize, nInputFrame, nOutputFrame); } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame); @@ -311,8 +311,8 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, NULL, kW, dW, padW); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[ndim - 1]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; THTensor_(resizeAs)(fgradInput, finput); @@ -330,7 +330,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame); } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #pragma omp parallel for private(t) @@ -373,9 +373,9 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( int64_t i; THTensor *gradOutput3d = THTensor_(newWithStorage3d)( gradOutput->storage, 
gradOutput->storageOffset, - gradOutput->size[0], -1, + gradOutput->size(0), -1, 1, -1, - gradOutput->size[1], -1); + gradOutput->size(1), -1); THTensor *tfinput = THTensor_(new)(); THTensor_(transpose)(tfinput, finput, 1, 2); @@ -386,13 +386,13 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( THTensor_(free)(tfinput); if (gradBias != NULL) { - for (i = 0; i < gradBias->size[0]; i++) { + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; real *data = THStorage_(data)(gradOutput3d->storage) + gradOutput3d->storageOffset - + i * gradOutput3d->stride[0]; - for (k = 0; k < gradOutput3d->size[2]; k++) { + + i * gradOutput3d->stride(0); + for (k = 0; k < gradOutput3d->size(2); k++) { sum += data[k]; } (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] @@ -441,7 +441,7 @@ void THNN_(TemporalRowConvolution_accGradParameters)( THNN_(TemporalRowConvolution_accGradParameters_frame)( gradOutput, gradWeight, gradBias, finput, scale); } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; for (t = 0; t < T; t++) { diff --git a/aten/src/THNN/generic/TemporalSubSampling.c b/aten/src/THNN/generic/TemporalSubSampling.c index 8c90d26a2cc403..5467827fe0ca81 100644 --- a/aten/src/THNN/generic/TemporalSubSampling.c +++ b/aten/src/THNN/generic/TemporalSubSampling.c @@ -19,15 +19,15 @@ static inline void THNN_(TemporalSubSampling_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && input->dim() == 2, 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); if (inputFrameSize != NULL) { - THArgCheck( input->size[1] == *inputFrameSize, 2, + THArgCheck( input->size(1) == *inputFrameSize, 2, "invalid input frame size. Got: %d, Expected: %d", - input->size[1], *inputFrameSize); + input->size(1), *inputFrameSize); } - THArgCheck( input->size[0] >= kW, 2, + THArgCheck( input->size(0) >= kW, 2, "input sequence smaller than kernel size. 
Got %d, Expected: %d", - input->size[0], kW); + input->size(0), kW); - nInputFrame = input->size[0]; + nInputFrame = input->size(0); nOutputFrame = (nInputFrame - kW) / dW + 1; if (gradOutput != NULL) { @@ -59,7 +59,7 @@ void THNN_(TemporalSubSampling_updateOutput)( outputFrame = THTensor_(new)(); inputWindow = THTensor_(new)(); - nInputFrame = input->size[0]; + nInputFrame = input->size(0); nOutputFrame = (nInputFrame - kW) / dW + 1; THTensor_(resize2d)(output, @@ -105,7 +105,7 @@ void THNN_(TemporalSubSampling_updateGradInput)( THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); - for(k = 0; k < gradOutput->size[0]; k++) + for(k = 0; k < gradOutput->size(0); k++) { THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW); THTensor_(select)(gradOutputFrame, gradOutput, 0, k); @@ -139,7 +139,7 @@ void THNN_(TemporalSubSampling_accGradParameters)( inputWindow = THTensor_(new)(); buffer = THTensor_(new)(); - for(k = 0; k < gradOutput->size[0]; k++) + for(k = 0; k < gradOutput->size(0); k++) { THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); THTensor_(select)(gradOutputFrame, gradOutput, 0, k); diff --git a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c index 1edf8a99741df4..5956312ab24393 100644 --- a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c +++ b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c @@ -109,8 +109,8 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( if (input->dim() == 5) { - istrideB = input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -118,15 +118,15 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); /* strides */ - istrideD = input->stride[dimD]; - istrideT = input->stride[dimT]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimD); + istrideT = input->stride(dimT); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 4) @@ -253,7 +253,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 5) { - sizeB = input->size[0]; + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -261,13 +261,13 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeT = gradOutput->size[dimT]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeT = gradOutput->size(dimT); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c index 74efa76ebd42d1..00d5a763d38ae0 100644 --- a/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c +++ b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c @@ -120,8 +120,8 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( if (input->dim() == 5) { - istrideB = 
input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -129,15 +129,15 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); /* strides */ - istrideD = input->stride[dimD]; - istrideT = input->stride[dimT]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimD); + istrideT = input->stride(dimT); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 4) @@ -254,7 +254,7 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 5) { - sizeB = input->size[0]; + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -262,13 +262,13 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeT = gradOutput->size[dimT]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeT = gradOutput->size(dimT); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/VolumetricAveragePooling.c b/aten/src/THNN/generic/VolumetricAveragePooling.c index c9dd9f753dcc53..93448ad2e62534 100644 --- a/aten/src/THNN/generic/VolumetricAveragePooling.c +++ b/aten/src/THNN/generic/VolumetricAveragePooling.c @@ -47,11 +47,11 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH - && input->size[dimt] >= kT, 2, + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH + && input->size(dimt) >= kT, 2, "input image (T: %d H: %d W: %d) smaller than " "kernel size (kT: %d kH: %d kW: %d)", - input->size[dimt], input->size[dimh], input->size[dimw], + input->size(dimt), input->size(dimh), input->size(dimw), kT, kH, kW); // The second argument is argNumber... here is the index of padH. 
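Most of the THNN hunks in this patch are the same mechanical change: raw member indexing into the tensor's size and stride arrays (input->size[d], input->stride[d]) becomes an accessor call (input->size(d), input->stride(d)). A minimal sketch of the pattern follows; TensorSketch is invented for illustration and is not the real THTensor, which presumably moves sizes and strides behind accessors so the backing representation can change without editing every kernel again.

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Invented stand-in, not the real THTensor: it only demonstrates the
    // call-site change from size[d]/stride[d] to size(d)/stride(d).
    struct TensorSketch {
      std::vector<int64_t> sizes_;
      std::vector<int64_t> strides_;

      int64_t size(int64_t d) const {      // new-style accessor
        if (d < 0 || d >= (int64_t)sizes_.size())
          throw std::out_of_range("dimension out of range");
        return sizes_[d];
      }
      int64_t stride(int64_t d) const {    // new-style accessor
        if (d < 0 || d >= (int64_t)strides_.size())
          throw std::out_of_range("dimension out of range");
        return strides_[d];
      }
    };

    // A call site written against the accessors no longer cares how the
    // sizes are stored; pre-patch code would have read t.size[d] directly.
    int64_t numelSketch(const TensorSketch& t) {
      int64_t n = 1;
      for (int64_t d = 0; d < (int64_t)t.sizes_.size(); ++d)
        n *= t.size(d);
      return n;
    }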
@@ -61,10 +61,10 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( padT, padW, padH, kT, kW, kH); /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceil_mode) { otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; @@ -231,10 +231,10 @@ void THNN_(VolumetricAveragePooling_updateOutput)( } /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceil_mode) { otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; @@ -283,7 +283,7 @@ void THNN_(VolumetricAveragePooling_updateOutput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; @@ -445,13 +445,13 @@ void THNN_(VolumetricAveragePooling_updateGradInput)( } /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; - otime = gradOutput->size[dimt]; - oheight = gradOutput->size[dimh]; - owidth = gradOutput->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); + otime = gradOutput->size(dimt); + oheight = gradOutput->size(dimh); + owidth = gradOutput->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); @@ -473,7 +473,7 @@ void THNN_(VolumetricAveragePooling_updateGradInput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index d88cc606eea184..4b74445e047705 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -33,13 +33,13 @@ void THNN_(VolumetricConvolution_updateOutput)( dimw++; } - int64_t nOutputPlane = weight->size[0]; - int64_t kT = weight->size[2]; - int64_t kH = weight->size[3]; - int64_t kW = weight->size[4]; - int64_t inputDepth = input->size[dimt]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t nOutputPlane = weight->size(0); + int64_t kT = weight->size(2); + int64_t kH = weight->size(3); + int64_t kW = weight->size(4); + int64_t inputDepth = input->size(dimt); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputDepth = (inputDepth - kT) / dT + 1; int64_t outputWidth = (inputWidth - kW) / dW + 1; int64_t outputHeight = (inputHeight - kH) / dH + 1; @@ -51,7 +51,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size[0]; i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -65,7 +65,7 @@ void THNN_(VolumetricConvolution_updateOutput)( } else /* batch mode */ { - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, 
outputWidth); THTensor *inb = THTensor_(new)(); THTensor *outb = THTensor_(new)(); @@ -78,7 +78,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size[0]; i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, outb, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -117,7 +117,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); - int nOutputPlane = (int)weight->size[0]; + int nOutputPlane = (int)weight->size(0); THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, gradOutput, @@ -129,7 +129,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( dimPlane++; } - THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + THArgCheck(nOutputPlane == gradOutput->size(dimPlane), 1, "Number of output features is not equal to nOutputPlane" ); @@ -141,13 +141,13 @@ void THNN_(VolumetricConvolution_updateGradInput)( } else /* batch mode */ { - int64_t nBatch = gradOutput->size[0]; + int64_t nBatch = gradOutput->size(0); THTensor *ginpb = THTensor_(new)(); THTensor *goutb = THTensor_(new)(); int64_t j; THTensor_(resize5d)(gradInput, - input->size[0], input->size[1], input->size[2], input->size[3], input->size[4] + input->size(0), input->size(1), input->size(2), input->size(3), input->size(4) ); /* loop over batches */ @@ -187,9 +187,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for gradWeight, but got: %s"); - int nOutputPlane = (int)gradWeight->size[0]; + int nOutputPlane = (int)gradWeight->size(0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size[0] == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } @@ -203,7 +203,7 @@ void THNN_(VolumetricConvolution_accGradParameters)( dimPlane++; } - THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + THArgCheck(nOutputPlane == gradOutput->size(dimPlane), 1, "Number of output features is not equal to nOutputPlane" ); @@ -226,7 +226,7 @@ void THNN_(VolumetricConvolution_accGradParameters)( } else /* batch mode */ { - int64_t nBatch = gradOutput->size[0]; + int64_t nBatch = gradOutput->size(0); THTensor *inpb = THTensor_(new)(); THTensor *goutb = THTensor_(new)(); int64_t j; diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 2fa1874c55941c..525fa5928edc68 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -31,7 +31,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 5), 5, weight, "non-empty 2D or 5D weight tensor expected, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -62,9 +62,9 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( int64_t outputHeight; int64_t outputWidth; - inputDepth = input->size[dimt]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; + inputDepth = input->size(dimt); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); 
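The output-extent arithmetic shared by the convolution and pooling kernels in these files is easy to sanity-check with concrete numbers. The sketch below uses made-up sizes; floor division is what the convolution paths use, and ceil_mode (as in VolumetricAveragePooling) switches pooling to ceil.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Illustrative numbers only. floor matches the convolution paths;
    // ceil_mode switches pooling (e.g. VolumetricAveragePooling) to ceil.
    int64_t out_extent(int64_t in, int64_t k, int64_t d, int64_t pad, bool ceil_mode) {
      double span = (double)(in + 2 * pad - k);
      double steps = ceil_mode ? std::ceil(span / (double)d) : std::floor(span / (double)d);
      return (int64_t)steps + 1;
    }

    int main() {
      // itime = 16, kT = 3, dT = 2, padT = 1
      std::printf("%lld\n", (long long)out_extent(16, 3, 2, 1, false)); // prints 8
      std::printf("%lld\n", (long long)out_extent(16, 3, 2, 1, true));  // prints 9
      return 0;
    }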
exactInputDepth = inputDepth + 2*pT; exactInputHeight = inputHeight + 2*pH; @@ -88,7 +88,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); if (weight->dim() == 2) { nInputPlane /= (kT * kH * kW); } @@ -97,10 +97,10 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); @@ -113,8 +113,8 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight) { weight = THTensor_(newContiguous)(weight); if (weight->dim() == 5) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); THTensor *old_weight = weight; weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1); @@ -435,7 +435,7 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( for (i = 0; i < nOutputPlane; i++) { THVector_(fill)( - THStorage_(data)(output->storage)+output->storageOffset+output->stride[0]*i, + THStorage_(data)(output->storage)+output->storageOffset+output->stride(0)*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth ); @@ -494,11 +494,11 @@ void THNN_(VolumetricConvolutionMM_updateOutput)( dimw++; } - nInputPlane = input->size[dimf]; - inputDepth = input->size[dimt]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; - nOutputPlane = weight->size[0]; + nInputPlane = input->size(dimf); + inputDepth = input->size(dimt); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); + nOutputPlane = weight->size(0); outputDepth = (inputDepth + 2*pT - kT) / dT + 1; outputHeight = (inputHeight + 2*pH - kH) / dH + 1; outputWidth = (inputWidth + 2*pW - kW) / dW + 1; @@ -521,7 +521,7 @@ void THNN_(VolumetricConvolutionMM_updateOutput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); @@ -571,8 +571,8 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( { THTensor *gradOutput2d = THTensor_(newWithStorage2d)( gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); @@ -585,8 +585,8 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( kT, kW, kH, dT, dW, dH, pT, pW, pH, - gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], - gradOutput->size[1], gradOutput->size[3], gradOutput->size[2] + gradInput->size(0), gradInput->size(1), gradInput->size(3), gradInput->size(2), + gradOutput->size(1), gradOutput->size(3), gradOutput->size(2) ); } @@ -636,7 +636,7 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #ifdef _OPENMP @@ -677,8 
+677,8 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d)( gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); if (gradWeight){ @@ -689,12 +689,12 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for (i = 0; i < gradBias->size[0]; i++) + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; - for (k = 0; k < gradOutput2d->size[1]; k++) + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + for (k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale * sum; @@ -735,7 +735,7 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)( } else // batch mode { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #ifdef _OPENMP diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c index 66d560a5d3e06e..845093eb1f0b30 100644 --- a/aten/src/THNN/generic/VolumetricDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -24,7 +24,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -44,9 +44,9 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( dimw++; } - int64_t inputDepth = input->size[dimd]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputDepth = input->size(dimd); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -58,16 +58,16 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -95,8 +95,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( dilationT, dilationH, dilationW, 0); // Params: - int64_t nInputPlane = weight->size[1]; - int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(1); + int64_t nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -109,18 
+109,18 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputHeight = input->size[3]; - int64_t inputWidth = input->size[4]; + int64_t inputDepth = input->size(2); + int64_t inputHeight = input->size(3); + int64_t inputWidth = input->size(4); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -133,7 +133,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->dim() != 3 || - ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -182,7 +182,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kT*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -230,8 +230,8 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( dilationT, dilationH, dilationW, 0); // Params - int64_t nInputPlane = weight->size[1]; - int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(1); + int64_t nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -242,19 +242,19 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -275,7 +275,7 @@ void 
THNN_(VolumetricDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B int64_t m = nInputPlane*kT*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -352,24 +352,24 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( if (input->dim() == 4) { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
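The m/n/k choices around the THBlas_(gemm) calls follow the usual im2col (here vol2col) formulation. The sketch below spells the shapes out with invented numbers; the claim that columns->size(1) equals the number of output locations is an assumption based on the standard column-buffer layout, not something visible in these hunks.

    #include <cstdint>
    #include <cstdio>

    // Made-up shapes spelling out the GEMM dimensions used above:
    //   weight  viewed as  nOutputPlane            x (nInputPlane*kT*kH*kW)
    //   columns viewed as  (nInputPlane*kT*kH*kW)  x (number of output voxels)
    //   output  viewed as  nOutputPlane            x (number of output voxels)
    int main() {
      const int64_t nInputPlane = 4, nOutputPlane = 8;
      const int64_t kT = 3, kH = 3, kW = 3;
      const int64_t outD = 5, outH = 7, outW = 7;

      const int64_t m = nOutputPlane;                // rows of the per-sample output
      const int64_t n = outD * outH * outW;          // assumed to equal columns->size(1)
      const int64_t k = nInputPlane * kT * kH * kW;  // shared inner dimension

      std::printf("updateOutput gemm: m=%lld n=%lld k=%lld\n",
                  (long long)m, (long long)n, (long long)k);
      // updateGradInput swaps the roles (m = nInputPlane*kT*kW*kH, k = nOutputPlane)
      // because the transposed weight maps gradOutput back into the column buffer.
      return 0;
    }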
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -405,7 +405,7 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; int64_t n = nInputPlane*kT*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( diff --git a/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c index 1641c6018c51d2..aaa00ffa353769 100644 --- a/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c +++ b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c @@ -51,10 +51,10 @@ static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", kT, kW, kH, pT, pW, pH); - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceilMode) { otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; @@ -241,10 +241,10 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)( ceilMode); /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceilMode) { otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; @@ -298,7 +298,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; @@ -444,13 +444,13 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( } /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; - otime = gradOutput->size[dimt]; - oheight = gradOutput->size[dimh]; - owidth = gradOutput->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); + otime = gradOutput->size(dimt); + oheight = gradOutput->size(dimh); + owidth = gradOutput->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); @@ -474,7 +474,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c index c7c18eab7a0ddd..4cc4dcc69837d8 100644 --- a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -112,7 +112,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(1)); } } else if (!weight_nullable) { THError("weight tensor is expected to be 
non-nullable"); @@ -132,13 +132,13 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( } if (weight != NULL) { - const int64_t nInputPlane = weight->size[0]; + const int64_t nInputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } - const int64_t inputWidth = input->size[dimw]; - const int64_t inputHeight = input->size[dimh]; - const int64_t inputDepth = input->size[dimd]; + const int64_t inputWidth = input->size(dimw); + const int64_t inputHeight = input->size(dimh); + const int64_t inputDepth = input->size(dimd); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; @@ -151,10 +151,10 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - const int64_t nOutputPlane = weight->size[1]; + const int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - const int64_t nOutputPlane = bias->size[0]; + const int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -184,8 +184,8 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( input, NULL, weight, bias, kT, kW, kH, dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); - const int nInputPlane = (int)weight->size[0]; - const int nOutputPlane = (int)weight->size[1]; + const int nInputPlane = (int)weight->size(0); + const int nOutputPlane = (int)weight->size(1); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -195,18 +195,18 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - const int64_t inputWidth = input->size[4]; - const int64_t inputHeight = input->size[3]; - const int64_t inputDepth = input->size[2]; + const int64_t inputWidth = input->size(4); + const int64_t inputHeight = input->size(3); + const int64_t inputDepth = input->size(2); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; // Batch size + input planes - const int64_t batchSize = input->size[0]; + const int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -218,7 +218,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); @@ -239,9 +239,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - const int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - const int64_t n = columns->size[1]; - const int64_t k = weight->size[0]; + const int64_t m = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); + const int64_t n = columns->size(1); + const int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -324,8 +324,8 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( input, gradOutput, weight, NULL, kT, kW, kH, dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); - const int64_t nInputPlane = weight->size[0]; - const int64_t nOutputPlane = weight->size[1]; + const int64_t nInputPlane = weight->size(0); + const int64_t nOutputPlane = weight->size(1); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -336,19 +336,19 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - const int64_t inputWidth = input->size[4]; - const int64_t inputHeight = input->size[3]; - const int64_t inputDepth = input->size[2]; + const int64_t inputWidth = input->size(4); + const int64_t inputHeight = input->size(3); + const int64_t inputDepth = input->size(2); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; // Batch size + input planes - const int64_t batchSize = input->size[0]; + const int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -383,9 +383,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - const int64_t m = weight->size[0]; - const int64_t n = gradColumns->size[1]; - const int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const int64_t m = weight->size(0); + const int64_t n = gradColumns->size(1); + const int64_t k = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -464,22 +464,22 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + 
THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - const int64_t inputWidth = input->size[4]; - const int64_t inputHeight = input->size[3]; - const int64_t inputDepth = input->size[2]; + const int64_t inputWidth = input->size(4); + const int64_t inputHeight = input->size(3); + const int64_t inputDepth = input->size(2); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; // Batch size + input planes - const int64_t batchSize = input->size[0]; + const int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); @@ -519,9 +519,9 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - const int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw - const int64_t m = input_n->size[0]; // nInputPlane - const int64_t k = columns->size[1]; // inputHeight * inputWidth + const int64_t n = columns->size(0); // nOutputPlane * kt * kh * kw + const int64_t m = input_n->size(0); // nInputPlane + const int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -563,7 +563,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (is_batch == 0) { THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THTensor_(resize4d)(input, input->size[1], inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(input, input->size(1), inputDepth, inputHeight, inputWidth); } THTensor_(free)(input); diff --git a/aten/src/THNN/generic/VolumetricMaxUnpooling.c b/aten/src/THNN/generic/VolumetricMaxUnpooling.c index b8e649cc39d7e4..566b656e739b58 100644 --- a/aten/src/THNN/generic/VolumetricMaxUnpooling.c +++ b/aten/src/THNN/generic/VolumetricMaxUnpooling.c @@ -38,14 +38,14 @@ static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( dimh++; dimn++; } - int nslices = input->size[dimn]; + int nslices = input->size(dimn); if (gradOutput != NULL) { - if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + if (oT != gradOutput->size(dimt) || oW != gradOutput->size(dimw) || oH != gradOutput->size(dimh)) { THError( "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", - oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw] + oT, oH, oW, gradOutput->size(dimt), gradOutput->size(dimh), gradOutput->size(dimw) ); } @@ -140,17 +140,17 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)( if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimt++; dimw++; dimh++; } /* sizes */ - nslices = input->size[dimt-1]; - iT = input->size[dimt]; - iH = input->size[dimh]; - iW = input->size[dimw]; + nslices = input->size(dimt-1); + iT = input->size(dimt); + iH = input->size(dimh); + iW = input->size(dimw); /* get contiguous input */ input = THTensor_(newContiguous)(input); @@ -287,17 +287,17 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)( if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimt++; dimw++; dimh++; } /* sizes */ - nslices = input->size[dimt-1]; - iT = input->size[dimt]; - iH = input->size[dimh]; - iW = input->size[dimw]; + nslices = input->size(dimt-1); + iT = input->size(dimt); + iH = input->size(dimh); + iW = input->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/VolumetricReplicationPadding.c b/aten/src/THNN/generic/VolumetricReplicationPadding.c index e64cb3662f01a5..7e91bfff8ed377 100644 --- a/aten/src/THNN/generic/VolumetricReplicationPadding.c +++ b/aten/src/THNN/generic/VolumetricReplicationPadding.c @@ -33,10 +33,10 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( } /* sizes */ - nslices = input->size[dimslices]; - idepth = input->size[dimd]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + idepth = input->size(dimd); + iheight = input->size(dimh); + iwidth = input->size(dimw); odepth = idepth + pfront + pback; oheight = iheight + ptop + pbottom; owidth = iwidth + pleft + pright; @@ -151,7 +151,7 @@ THNN_(VolumetricReplicationPadding_shapeCheck)( if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimd++; @@ -159,10 +159,10 @@ THNN_(VolumetricReplicationPadding_shapeCheck)( } /* sizes */ - nslices = input->size[dimslices]; - idepth = input->size[dimd]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + idepth = input->size(dimd); + iheight = input->size(dimh); + iwidth = input->size(dimw); odepth = idepth + pfront + pback; oheight = iheight + ptop + pbottom; owidth = iwidth + pleft + pright; @@ -295,7 +295,7 @@ void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimd++; @@ -303,10 +303,10 @@ void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, } /* sizes */ - nslices = input->size[dimslices]; - idepth = input->size[dimd]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + idepth = input->size(dimd); + iheight = input->size(dimh); + iwidth = input->size(dimw); odepth = idepth + pfront + pback; oheight = iheight + ptop + pbottom; owidth = iwidth + pleft + pright; diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index a28fb02ed02e23..0518c089235ad3 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -53,6 +53,11 @@ if(BUILD_ATEN) ENDIF(USE_ROCM) endif() +# ---[ Torch build +if(BUILD_TORCH) + add_subdirectory(../torch torch) +endif() + # ---[ Caffe2 build if(BUILD_CAFFE2) # Note: the 
folders that are being commented out have not been properly @@ -456,7 +461,8 @@ if(BUILD_CAFFE2) endif() if(USE_ROCM) - add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + hip_add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINKER_LANGUAGE HIP) set_target_properties(caffe2_pybind11_state_hip PROPERTIES COMPILE_FLAGS "${HIP_HIPCC_FLAGS} -fvisibility=hidden") set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "") set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 3b378d23ffb522..cb4eaedfdbeceb 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -11,6 +11,7 @@ #include "caffe2/core/macros.h" #include "caffe2/core/asan.h" +#include "caffe2/core/blob_stats.h" #ifdef CAFFE2_USE_CUDNN #include "caffe2/core/common_cudnn.h" #endif // CAFFE2_USE_CUDNN @@ -252,7 +253,22 @@ struct Caffe2CudaInitializerHelper { } } }; -} // namespace + +struct TensorCUDAStatGetter : BlobStatGetter { + size_t sizeBytes(const Blob& blob) const override { + const auto& tensor = blob.Get(); + auto nbytes = tensor.nbytes(); + if (nbytes > 0 && tensor.IsType()) { + const auto* data = tensor.data(); + for (int i = 0; i < tensor.size(); ++i) { + nbytes += data[i].size(); + } + } + return nbytes; + } +}; +REGISTER_BLOB_STAT_GETTER(TensorCUDA, TensorCUDAStatGetter); +} // namespace /** * A utility function to rectify the gpu id. If the context specifies the diff --git a/caffe2/core/dispatch/CMakeLists.txt b/caffe2/core/dispatch/CMakeLists.txt index 736225fc1c6844..841bfca164684a 100644 --- a/caffe2/core/dispatch/CMakeLists.txt +++ b/caffe2/core/dispatch/CMakeLists.txt @@ -5,6 +5,7 @@ set(LIB_SOURCES DispatchTable.cpp KernelRegistration.cpp LayoutId.cpp + LeftRight.cpp OpSchema.cpp OpSchemaRegistration.cpp TensorTypeId.cpp diff --git a/caffe2/core/dispatch/DispatchTable.h b/caffe2/core/dispatch/DispatchTable.h index 43901b2e5500a7..0f119791dbfa03 100644 --- a/caffe2/core/dispatch/DispatchTable.h +++ b/caffe2/core/dispatch/DispatchTable.h @@ -1,104 +1,93 @@ #pragma once -#include "caffe2/utils/flat_hash_map/flat_hash_map.h" -#include "caffe2/utils/Metaprogramming.h" +#include "caffe2/core/dispatch/LeftRight.h" #include "caffe2/core/dispatch/OpSchema.h" +#include "caffe2/utils/Metaprogramming.h" +#include "caffe2/utils/flat_hash_map/flat_hash_map.h" -#include #include -#include +#include #include #include +#include +#include namespace c10 { namespace details { - /// Kernel implementations in a thread-safe hash table. -template +template class ThreadsafeOperatorTable_ final { -public: - // TODO The current implementation below does not have the correct correctness characteristics - // which we need. It's worth spelling out exactly what we need: - // - // - We need LOCK FREE read access to the table (as per the performance benchmark - // at https://fb.quip.com/hvz3AGnx8MQ8 - // - // - We need to support writes which are possibly concurrent with reads, occurring when - // a dynamic library is loaded or unloaded. - // - // - We probably can require that dynamic library loads/unloads be synchronized (so - // there are never two concurrent loads.) 
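The replacement hunks that follow route every dispatch-table access through the read()/write() lambda interface of the LeftRight primitive added later in this diff. The sketch below is interface-only: LeftRightish is an invented stand-in backed by a plain mutex, so it shows how emplace/lookup get rewritten around lambdas but does not reproduce the wait-free reader scheme.

    #include <mutex>
    #include <string>
    #include <unordered_map>
    #include <utility>

    // Interface-only stand-in: LeftRightish copies the read()/write() lambda
    // shape of the LeftRight class added later in this diff, but it is backed
    // by a plain mutex, so it does not reproduce the wait-free reader scheme.
    template <class T>
    class LeftRightish {
     public:
      template <class F>
      auto read(F&& fn) const -> decltype(fn(std::declval<const T&>())) {
        std::lock_guard<std::mutex> guard(mutex_);
        return fn(data_);
      }
      template <class F>
      auto write(F&& fn) -> decltype(fn(std::declval<T&>())) {
        std::lock_guard<std::mutex> guard(mutex_);
        return fn(data_);
      }
     private:
      mutable std::mutex mutex_;
      T data_;
    };

    using Table = std::unordered_map<std::string, void*>;

    // Mirrors the reworked emplace(): the mutation runs inside write() and
    // reports whether the key was actually inserted.
    bool registerKernel(LeftRightish<Table>& kernels, const std::string& key, void* fn) {
      return kernels.write([&](Table& map) -> bool {
        return map.emplace(key, fn).second;
      });
    }

    // Mirrors the reworked lookup(): the search runs inside read() and returns
    // the stored pointer, or nullptr when the key is absent.
    void* lookupKernel(const LeftRightish<Table>& kernels, const std::string& key) {
      return kernels.read([&](const Table& map) -> void* {
        auto it = map.find(key);
        return it == map.end() ? nullptr : it->second;
      });
    }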
- - template - void emplace(Key_&& key, void* value) { - using std::to_string; - // TODO Locking - //std::unique_lock lock(mutex_); - - auto result = map_.emplace(std::forward(key), value); - if (!result.second) { - std::ostringstream msg; - msg << "Tried to register conflicting kernels to the dispatcher: " << key; - throw std::logic_error(msg.str()); - } + public: + template + void emplace(Key_&& key, void* value) { + bool res = map_.write([&](ska::flat_hash_map& map) -> bool { + auto result = map->emplace(std::forward(key), value); + return result.second; + }); + if (!res) { + std::ostringstream msg; + msg << "Tried to register conflicting kernels to the dispatcher: " << key; + throw std::logic_error(msg.str()); } + } - void erase(const Key& key) { - // TODO Locking - //std::unique_lock lock(mutex_); - - size_t num_removed = map_.erase(key); - assert(num_removed <= 1); //This is not a multi-map - if (num_removed == 0) { - throw std::logic_error("Tried to deregister a kernel that isn't registered."); - } + void erase(const Key& key) { + auto num_removed = + map_.write([&](ska::flat_hash_map& map) -> size_t { + return map->erase(key); + }); + assert(num_removed <= 1); // This is not a multi-map + if (num_removed == 0) { + throw std::logic_error( + "Tried to deregister a kernel that isn't registered."); } + } - void* lookup(const Key& key) const { - // TODO (lock needed but slow perf. Find better way) - // std::shared_lock lock(mutex_); - auto found = map_.find(key); - if (found == map_.end()) { - return nullptr; - } else { + void* lookup(const Key& key) const { + return map_.read([&](const ska::flat_hash_map& map) -> void* { + auto found = map->find(key); + if (found != map->end()) { return found->second; + } else { + return nullptr; } - } + }); + } -private: - ska::flat_hash_map map_; - // TODO Figure out how to get fast locking in C++11 (use boost::shared_timed_mutex? folly::SharedMutex? LR pattern?) - //mutable std::shared_timed_mutex mutex_; + private: + LeftRight> map_; }; } // namespace details /** * Per-operator dispatch table. * - * Given an operator specified by 'OpSchemaDef', this class records a dispatch table for - * various kernels provided for this operator. For example, if we consider the operator - * add(Tensor, Tensor), the dispatch table for this operator may contain implementations - * for various dynamic tensor types, such as (CPUFloatTensor, CPUFloatTensor), - * (CUDAFloatTensor, CUDAFloatTensor), etc. + * Given an operator specified by 'OpSchemaDef', this class records a dispatch + * table for various kernels provided for this operator. For example, if we + * consider the operator add(Tensor, Tensor), the dispatch table for this + * operator may contain implementations for various dynamic tensor types, such + * as (CPUFloatTensor, CPUFloatTensor), (CUDAFloatTensor, CUDAFloatTensor), etc. * * @tparam OpSchemaDef The operator signature this dispatch table encodes. */ // TODO: Support dispatch for meta-operators (which apply to all dynamic types) -template +template class DispatchTable final { -private: + private: using Schema = OpSchema; -public: - DispatchTable(): kernels_() {} + public: + DispatchTable() : kernels_() {} /** * Register a kernel in the table at some dispatch key. 
* @param func Concrete kernel function implementation to register * @param dispatch_key Dispatch key to define when this kernel is selected */ - void registerKernel(typename Schema::signature::func_type* func, typename Schema::dispatch::dispatch_key_type dispatch_key) { + void registerKernel( + typename Schema::signature::func_type* func, + typename Schema::dispatch::dispatch_key_type dispatch_key) { kernels_.emplace(std::move(dispatch_key), reinterpret_cast(func)); } @@ -107,10 +96,11 @@ class DispatchTable final { * * @param dispatch_key Dispatch key to unregister. */ - // TODO: This isn't going to work so well when we get more complicated override patterns! - // In this case, an operator will show up in multiple slots, and erasing them one-by-one - // is probably not such a good idea. - void deregisterKernel(const typename Schema::dispatch::dispatch_key_type& dispatch_key) { + // TODO: This isn't going to work so well when we get more complicated + // override patterns! In this case, an operator will show up in multiple + // slots, and erasing them one-by-one is probably not such a good idea. + void deregisterKernel( + const typename Schema::dispatch::dispatch_key_type& dispatch_key) { kernels_.erase(dispatch_key); } @@ -121,27 +111,36 @@ class DispatchTable final { * @param args Arguments to invoke the function with * @return Returned value of the operator */ - template + template typename Schema::signature::return_type call(Args&&... args) const { - // TODO Better error message, but need to take care that reference arguments match non-reference arguments and so on. - // static_assert(std::is_same::value, "Argument types don't match operator signature"); + // TODO Better error message, but need to take care that reference arguments + // match non-reference arguments and so on. + // static_assert(std::is_same::value, "Argument types don't match + // operator signature"); auto kernel_func = lookupKernelFunc_(args...); return kernel_func(std::forward(args)...); } -private: - template - typename Schema::signature::func_type* lookupKernelFunc_(const Args&... args) const { + private: + template + typename Schema::signature::func_type* lookupKernelFunc_( + const Args&... args) const { auto dispatch_key = Schema::dispatch::dispatch_key(args...); void* found = kernels_.lookup(dispatch_key); if (found == nullptr) { - // TODO Better error message - include op name and dispatch key (i.e. argument types) - throw std::logic_error(std::string() + "Didn't find kernel to dispatch to for operator '" + Schema::metadata::name() + "'"); + // TODO Better error message - include op name and dispatch key (i.e. + // argument types) + throw std::logic_error( + std::string() + "Didn't find kernel to dispatch to for operator '" + + Schema::metadata::name() + "'"); } return reinterpret_cast(found); } - details::ThreadsafeOperatorTable_ kernels_; + details::ThreadsafeOperatorTable_< + typename Schema::dispatch::dispatch_key_type> + kernels_; }; } // namespace c10 @@ -151,4 +150,5 @@ class DispatchTable final { * It has an implementation for each op schema def in a cpp file, because * we can't rely on the one-definition-rule. 
*/ -template c10::DispatchTable& c10_dispatch_table(); +template +c10::DispatchTable& c10_dispatch_table(); diff --git a/caffe2/core/dispatch/LeftRight.cpp b/caffe2/core/dispatch/LeftRight.cpp new file mode 100644 index 00000000000000..26e7a7e114ccf9 --- /dev/null +++ b/caffe2/core/dispatch/LeftRight.cpp @@ -0,0 +1 @@ +#include "caffe2/core/dispatch/LeftRight.h" diff --git a/caffe2/core/dispatch/LeftRight.h b/caffe2/core/dispatch/LeftRight.h new file mode 100644 index 00000000000000..dc60a303c412cd --- /dev/null +++ b/caffe2/core/dispatch/LeftRight.h @@ -0,0 +1,72 @@ +#include +#include +#include +#include + +namespace c10 { +namespace details { + +// LeftRight wait-free readers synchronization primitive +// https://hal.archives-ouvertes.fr/hal-01207881/document +template +class LeftRight { + public: + LeftRight() { + counters_[0].store(0); + counters_[1].store(0); + } + + template + auto read(F&& readFunc) -> typename std::result_of::type { + auto localCounterIndex = counterIndex_.load(); + ++counters_[localCounterIndex]; + try { + auto r = readFunc(data_[dataIndex_.load()]); + --counters_[localCounterIndex]; + return r; + } catch (const std::exception& e) { + --counters_[localCounterIndex]; + throw; + } + } + + // Throwing from write would result in invalid state + template + auto write(F&& writeFunc) -> typename std::result_of::type { + std::unique_lock lock(mutex_); + uniqueWrite(std::forward(writeFunc)); + } + + private: + // This function doesn't use any locks for the writers. Use only if you know + // what you're doing + template + auto uniqueWrite(F&& writeFunc) -> typename std::result_of::type { + try { + auto localDataIndex = dataIndex_.load(); + writeFunc(data_[localDataIndex ^ 1]); + dataIndex_ = localDataIndex ^ 1; + auto localCounterIndex = counterIndex_.load(); + while (counters_[localCounterIndex ^ 1].load()) { + std::this_thread::yield(); + } + counterIndex_ = localCounterIndex ^ 1; + while (counters_[localCounterIndex].load()) { + std::this_thread::yield(); + } + return writeFunc(data_[localDataIndex]); + } catch (const std::exception& e) { + // rethrow + throw; + } + } + + std::mutex mutex_; + std::atomic counterIndex_{0}; + std::atomic dataIndex_{0}; + std::atomic counters_[2]; + T data_[2]; +}; + +} // namespace details +} // namespace c10 diff --git a/caffe2/core/hip/operator_hip_test.cc b/caffe2/core/hip/operator_hip_test.cc index 14b0188452fe29..f7c6ef34c43cdc 100644 --- a/caffe2/core/hip/operator_hip_test.cc +++ b/caffe2/core/hip/operator_hip_test.cc @@ -49,7 +49,7 @@ TEST(EnginePrefTest, GPUDeviceDefaultPreferredEngines) { const auto op = CreateOperator(op_def, &ws); EXPECT_NE(nullptr, op.get()); - EXPECT_EQ(static_cast(op.get())->type(), "HIP"); + EXPECT_EQ(static_cast(op.get())->type(), "MIOPEN"); } } diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index e017456497e743..169e730125a2cd 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -70,8 +70,8 @@ PerOpEnginePrefType& g_per_op_engine_pref() { } GlobalEnginePrefType& g_global_engine_pref() { - static auto* g_global_engine_pref_ = - new GlobalEnginePrefType{{DeviceType::CUDA, {"CUDNN"}}}; + static auto* g_global_engine_pref_ = new GlobalEnginePrefType{ + {DeviceType::CUDA, {"CUDNN"}}, {DeviceType::HIP, {"MIOPEN"}}}; return *g_global_engine_pref_; } diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9372ed49f4243a..26bb02415d3ea2 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -20,6 +20,7 @@ #include "caffe2/core/types.h" #include 
"caffe2/core/workspace.h" #include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/filler.h" #include "caffe2/utils/proto_utils.h" namespace caffe2 { @@ -521,6 +522,74 @@ class Operator : public OperatorBase { return &context_; } + virtual std::vector> InputFillers( + const std::vector>& shapes) { + CAFFE_ENFORCE(shapes.size() == Inputs().size()); + std::vector> fillers; + for (const auto& shape : shapes) { + fillers.emplace_back(shape, &context_); + } + + return fillers; + } + +#define DISABLE_INPUT_FILLERS(Context) \ + std::vector> InputFillers( \ + const std::vector>& /* unused */) override { \ + throw UnsupportedOperatorFeature("Op does not have input fillers"); \ + } + + void SparseLengthsFillerHelper( + const std::vector>& shapes, + size_t value_index, + size_t length_index, + std::vector>* fillers) { + CAFFE_ENFORCE_EQ(shapes[length_index].size(), 1); + (*fillers)[length_index].SparseLengths(shapes[value_index].front()); + } + + void SparseSegmentsFillerHelper( + const std::vector>& shapes, + size_t value_index, + size_t segment_index, + std::vector>* fillers) { + CAFFE_ENFORCE_EQ(shapes[segment_index].size(), 1); + // TODO: what would be a proper #segments + (*fillers)[segment_index].SparseSegments(shapes[value_index].front() - 1); + } + +// The helper is build sparse input with values and lengths; e.g.: +// values = [1, 2, 3, 2, 4, 6, 7, 3, 6] +// \_____/ \________/ \__/ +// lengths = [3, 4, 2] +#define USE_VALUE_LENGTH_INPUT_FILLERS(Context, value_index, length_index) \ + std::vector> InputFillers( \ + const std::vector>& shapes) override { \ + CAFFE_ENFORCE_EQ(shapes.size(), Operator::Inputs().size()); \ + auto fillers = Operator::InputFillers(shapes); \ + Operator::SparseLengthsFillerHelper( \ + shapes, value_index, length_index, &fillers); \ + return fillers; \ + } + + // The helper is build sparse input with values, keys, and lengths; e.g.: + // values = [1, 2, 3, 2, 4, 6, 7, 3, 6] + // keys = [0, 1, 4, 0, 1, 2, 5, 1, 2] + // \_____/ \________/ \__/ + // lengths = [3, 4, 2] +#define USE_VALUE_KEY_LENGTH_INPUT_FILLERS( \ + Context, value_index, key_index, length_index) \ + std::vector> InputFillers( \ + const std::vector>& shapes) override { \ + CAFFE_ENFORCE_EQ(shapes.size(), Operator::Inputs().size()); \ + auto fillers = Operator::InputFillers(shapes); \ + Operator::SparseLengthsFillerHelper( \ + shapes, key_index, length_index, &fillers); \ + Operator::SparseSegmentsFillerHelper( \ + shapes, value_index, key_index, &fillers); \ + return fillers; \ + } + protected: void RecordEvent(const char* err_msg = nullptr) final { if (event_) { diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc index bbe487b3ac6cd7..2aaa7a2dac3a30 100644 --- a/caffe2/core/predictor.cc +++ b/caffe2/core/predictor.cc @@ -71,13 +71,13 @@ Predictor::Predictor(const MetaNetDef& def, Workspace* parent, bool run_init) const auto& inputs = getBlobs(def, PredictorConsts::default_instance().inputs_blob_type()); for (const auto& input : inputs) { - inputNames_.insert(input); + config_.input_names.emplace_back(input); } const auto& outputs = getBlobs(def, PredictorConsts::default_instance().outputs_blob_type()); for (const auto& output : outputs) { - outputNames_.emplace_back(output); + config_.output_names.emplace_back(output); } } @@ -87,19 +87,19 @@ Predictor::Predictor( Workspace* parent, bool run_init, int optimization) - : run_net_(run_net), ws_(parent) { - + : ws_(parent) { + config_.predict_net = std::make_shared(run_net); if (run_init) { CAFFE_ENFORCE(ws_.RunNetOnce(init_net)); } #if 
CAFFE2_MOBILE GlobalInit(); #endif - + auto predict_net = config_.predict_net; if (optimization) { #ifdef CAFFE2_OPTIMIZER try { - run_net_ = opt::optimize(run_net_, &ws_, optimization); + *predict_net = opt::optimize(*predict_net, &ws_, optimization); } catch (const std::exception& e) { LOG(WARNING) << "Optimization pass failed: " << e.what(); } @@ -112,45 +112,52 @@ Predictor::Predictor( const auto& initialized_vec = ws_.Blobs(); const std::unordered_set initialized{initialized_vec.begin(), initialized_vec.end()}; - for (const auto& name : run_net.external_input()) { + for (const auto& name : predict_net->external_input()) { if (!initialized.count(name)) { auto* blob = ws_.CreateBlob(name); blob->template GetMutable(); } } - CAFFE_ENFORCE(ws_.CreateNet(run_net)); + CAFFE_ENFORCE(ws_.CreateNet(predict_net)); } bool Predictor::run(const TensorVector& inputs, TensorVector* outputs) { - CAFFE_ENFORCE(inputs.size() <= (unsigned)run_net_.external_input_size()); + CAFFE_ENFORCE( + inputs.size() <= + static_cast(config_.predict_net->external_input_size())); for (size_t i = 0; i < inputs.size(); ++i) { - shareInputTensor(&ws_, run_net_.external_input(i), inputs[i]); + shareInputTensor(&ws_, config_.predict_net->external_input(i), inputs[i]); } - if (!ws_.RunNet(run_net_.name())) { + if (!ws_.RunNet(config_.predict_net->name())) { return false; } - outputs->resize(run_net_.external_output_size()); + outputs->resize(config_.predict_net->external_output_size()); for (size_t i = 0; i < outputs->size(); ++i) { - (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); + (*outputs)[i] = + extractOutputTensor(&ws_, config_.predict_net->external_output(i)); } return true; } bool Predictor::run_map_workspace(const TensorMap& inputs) { - if (!inputNames_.empty()) { - CAFFE_ENFORCE_EQ(inputs.size(), inputNames_.size()); + if (!config_.input_names.empty()) { + CAFFE_ENFORCE_EQ(inputs.size(), input_names().size()); } for (auto input : inputs) { - if (!inputNames_.empty()) { - CAFFE_ENFORCE_GT(inputNames_.count(input.first), 0); + if (!input_names().empty()) { + CAFFE_ENFORCE( + std::find(input_names().begin(), input_names().end(), input.first) != + input_names().end(), + "Input can't be found: ", + input.first); } shareInputTensor(&ws_, input.first, input.second); } - return ws_.RunNet(run_net_.name()); + return ws_.RunNet(config_.predict_net->name()); } bool Predictor::run_map(const TensorMap& inputs, TensorVector* outputs) { @@ -158,9 +165,10 @@ bool Predictor::run_map(const TensorMap& inputs, TensorVector* outputs) { return false; } - outputs->resize(run_net_.external_output_size()); + outputs->resize(config_.predict_net->external_output_size()); for (size_t i = 0; i < outputs->size(); ++i) { - (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); + (*outputs)[i] = + extractOutputTensor(&ws_, config_.predict_net->external_output(i)); } return true; } @@ -170,8 +178,8 @@ bool Predictor::run_map_outputs(const TensorMap& inputs, TensorMap* outputs) { return false; } - outputs->reserve(outputNames_.size()); - for (const std::string& outputName : outputNames_) { + outputs->reserve(output_names().size()); + for (const std::string& outputName : output_names()) { (*outputs)[outputName] = extractOutputTensor(&ws_, outputName); } return true; diff --git a/caffe2/core/predictor.h b/caffe2/core/predictor.h index 1212946038e834..b56401a35da5c3 100644 --- a/caffe2/core/predictor.h +++ b/caffe2/core/predictor.h @@ -2,6 +2,7 @@ #include #include "caffe2/core/net.h" +#include 
"caffe2/core/predictor_config.h" #include "caffe2/core/tensor.h" #include "caffe2/proto/metanet.pb.h" #include "caffe2/proto/predictor_consts.pb.h" @@ -52,29 +53,24 @@ class Predictor { bool run_map_outputs(const TensorMap& inputs, TensorMap* outputs); const NetDef& def() const { - return run_net_; + return *config_.predict_net; }; Workspace* ws() { return &ws_; }; - const std::unordered_set& input_names() const { - return inputNames_; + const std::vector& input_names() const { + return config_.input_names; } const std::vector& output_names() const { - return outputNames_; + return config_.output_names; } private: bool run_map_workspace(const TensorMap& inputs); - - NetDef run_net_; + PredictorConfig config_; Workspace ws_; - std::unordered_set inputNames_; - // Outputs need to be ordered since TensorVector outputs rely on the outputs - // being in a certain order. - std::vector outputNames_; }; } diff --git a/caffe2/core/predictor_config.h b/caffe2/core/predictor_config.h new file mode 100644 index 00000000000000..343c573c59e93f --- /dev/null +++ b/caffe2/core/predictor_config.h @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include "caffe2/core/tensor.h" + +namespace caffe2 { + +/* + * Parameters for a Predictor provided by name. + * They are stored as shared_ptr to accommodate parameter sharing + */ +using PredictorParameters = std::map>; + +/** + * Stores parameters nessasary for creating a PredictorInterface object. + */ +struct PredictorConfig { + // A map of parameter name to Tensor object. Predictor is supposed to + // guarantee constness of all these Tensor objects. + std::shared_ptr parameters; + + std::shared_ptr predict_net; + + // Input names of a model. User will have to provide all of the inputs + // for inference + std::vector input_names; + // Output names of a model. All outputs will be returned as results of + // inference + std::vector output_names; + // Parameter names of a model. Should be a subset of parameters map passed in. + // We provide a separate set of parameter names here as whole parameter set + // passed in by a user might contain extra tensors used by other models + std::vector parameter_names; +}; + +} // namespace caffe2 diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 42b5c752fcce7c..09de1b1d095b81 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -258,6 +258,25 @@ class DetailedExportedStat : public ExportedStat { } }; +class StaticStat : public Stat { + private: + StatValue* value_; + + public: + StaticStat(const std::string& groupName, const std::string& name) + : Stat(groupName, name), + value_(StatRegistry::get().add(groupName + "/" + name)) {} + + int64_t increment(int64_t value = 1) { + return value_->reset(value); + } + + template + int64_t increment(T value, Unused1, Unused...) { + return increment(value); + } +}; + namespace detail { template @@ -285,7 +304,7 @@ template _ScopeGuard ScopeGuard(T f) { return _ScopeGuard(f); } -} +} // namespace detail #define CAFFE_STAT_CTOR(ClassName) \ ClassName(std::string name) : groupName(name) {} \ @@ -316,6 +335,11 @@ _ScopeGuard ScopeGuard(T f) { groupName, #name \ } +#define CAFFE_STATIC_STAT(name) \ + StaticStat name { \ + groupName, #name \ + } + #define CAFFE_EVENT(stats, field, ...) 
\ { \ auto __caffe_event_value_ = stats.field.increment(__VA_ARGS__); \ @@ -330,4 +354,4 @@ _ScopeGuard ScopeGuard(T f) { if (auto g = detail::ScopeGuard([&](int64_t nanos) { \ CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ })) -} +} // namespace caffe2 diff --git a/caffe2/core/stats_test.cc b/caffe2/core/stats_test.cc index 383daaf80d35c2..5d7c86bc918d12 100644 --- a/caffe2/core/stats_test.cc +++ b/caffe2/core/stats_test.cc @@ -117,5 +117,31 @@ TEST(StatsTest, StatsTestSimple) { toMap(reg2.publish()), ExportedStatMap({{"i1/s3", 0}, {"i2/s3", 0}})); } +TEST(StatsTest, StatsTestStatic) { + struct TestStats { + CAFFE_STAT_CTOR(TestStats); + CAFFE_STATIC_STAT(cpuUsage); + CAFFE_STATIC_STAT(memUsage); + }; + TestStats i1("i1"); + TestStats i2("i2"); + CAFFE_EVENT(i1, cpuUsage, 95); + CAFFE_EVENT(i2, memUsage, 80); + + ExportedStatList data; + StatRegistry::get().publish(data); + EXPECT_SUBSET( + toMap(data), ExportedStatMap({{"i1/cpuUsage", 95}, {"i2/memUsage", 80}})); + + CAFFE_EVENT(i1, cpuUsage, 80); + CAFFE_EVENT(i1, memUsage, 50); + CAFFE_EVENT(i2, memUsage, 90); + + StatRegistry::get().publish(data); + EXPECT_SUBSET( + toMap(data), + ExportedStatMap( + {{"i1/cpuUsage", 80}, {"i1/memUsage", 50}, {"i2/memUsage", 90}})); +} } // namespace } // namespace caffe2 diff --git a/caffe2/distributed/file_store_handler_op_gpu.cc b/caffe2/distributed/file_store_handler_op_gpu.cc index c2e3ff27c937c3..6c13d14f36a6b2 100644 --- a/caffe2/distributed/file_store_handler_op_gpu.cc +++ b/caffe2/distributed/file_store_handler_op_gpu.cc @@ -1,4 +1,4 @@ -#include "file_store_handler_op.h" +#include "caffe2/distributed/file_store_handler_op.h" #include diff --git a/caffe2/distributed/redis_store_handler_op_gpu.cc b/caffe2/distributed/redis_store_handler_op_gpu.cc index 9bf8d25bba6cb1..5a759e5340a616 100644 --- a/caffe2/distributed/redis_store_handler_op_gpu.cc +++ b/caffe2/distributed/redis_store_handler_op_gpu.cc @@ -1,4 +1,4 @@ -#include "redis_store_handler_op.h" +#include "caffe2/distributed/redis_store_handler_op.h" #include diff --git a/caffe2/experiments/operators/sparse_funhash_op.h b/caffe2/experiments/operators/sparse_funhash_op.h index 04c2441f297b12..5c5f27e46667c1 100644 --- a/caffe2/experiments/operators/sparse_funhash_op.h +++ b/caffe2/experiments/operators/sparse_funhash_op.h @@ -47,6 +47,9 @@ class SparseFunHashOp : public Operator { adaptive_ = (InputSize() == 5); } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { const auto& val = Input(0); const auto& key = Input(1); @@ -151,6 +154,9 @@ class SparseFunHashGradientOp : public Operator { adaptive_ = (InputSize() == 6); } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { const auto& grad_out = Input(0); const auto& val = Input(1); diff --git a/caffe2/experiments/operators/sparse_matrix_reshape_op.h b/caffe2/experiments/operators/sparse_matrix_reshape_op.h index b952a72158f450..8c8d51c4ed01dc 100644 --- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h +++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h @@ -91,6 +91,9 @@ class SparseMatrixReshapeOp : public Operator { new_stride_ = new_shape[1]; } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { auto& old_col = Input(0); CAFFE_ENFORCE(old_col.ndim() == 1, "Row index tensor must be 1-D."); diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 97bc8d1b348ef3..ac27cd7253b864 100644 
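The DISABLE_INPUT_FILLERS / USE_VALUE_LENGTH_INPUT_FILLERS hooks used by these sparse operators rely on the values/lengths convention diagrammed in operator.h above: a flat values tensor is partitioned into per-example segments by a 1-D lengths tensor whose entries sum to the number of values. A minimal standalone sketch of that convention (the helper name is illustrative and not part of this change):

#include <cstddef>
#include <iostream>
#include <vector>

// Expand a (values, lengths) pair into per-example segments, e.g.
// values  = {1, 2, 3, 2, 4, 6, 7, 3, 6}, lengths = {3, 4, 2}
// becomes {1, 2, 3}, {2, 4, 6, 7}, {3, 6}.
std::vector<std::vector<int>> expandValueLength(
    const std::vector<int>& values,
    const std::vector<int>& lengths) {
  std::vector<std::vector<int>> segments;
  std::ptrdiff_t offset = 0;
  for (int len : lengths) {
    segments.emplace_back(
        values.begin() + offset, values.begin() + offset + len);
    offset += len;
  }
  return segments; // sum(lengths) must equal values.size()
}

int main() {
  auto segs = expandValueLength({1, 2, 3, 2, 4, 6, 7, 3, 6}, {3, 4, 2});
  for (const auto& s : segs) {
    for (int v : s) std::cout << v << ' ';
    std::cout << '\n';
  }
}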
--- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -81,6 +81,18 @@ class IDEEPFallbackOp final : public IDEEPOperator { } else { input.reorder_to(dtensor->template mutable_data()); } + } else if ( + InputIsType(i) && + Input(i).get_data_type() == itensor::data_type::s32) { + auto& input = Input(i); + auto dtensor = local_input_blobs_[i]->template GetMutable(); + dtensor->Resize(input.get_dims()); + if (input.is_public_format()) { + dtensor->ShareExternalPointer( + static_cast(input.get_data_handle())); + } else { + input.reorder_to(dtensor->template mutable_data()); + } } else { VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy."; // Note(jiayq): This removes a const but conceptually @@ -122,6 +134,19 @@ class IDEEPFallbackOp final : public IDEEPOperator { dtensor->resize(dst_dims, itensor::data_type::f32); } dtensor->set_data_handle(const_cast(src.raw_data())); + } else if (src.template IsType()) { + Blob* dst = OperatorBase::OutputBlob(i); + if (!dst->template IsType()) { + dst->Reset(new itensor()); + } + + auto src_dims = src.dims(); + itensor::dims dst_dims(src_dims.begin(), src_dims.end()); + auto dtensor = dst->template GetMutable(); + if (dtensor->get_dims() != dst_dims) { + dtensor->resize(dst_dims, itensor::data_type::s32); + } + dtensor->set_data_handle(const_cast(src.raw_data())); } else { CAFFE_THROW("ideep memory only supports float data type."); } diff --git a/caffe2/image/CMakeLists.txt b/caffe2/image/CMakeLists.txt index 84d9007ad6a7da..fdc74ec0e50328 100644 --- a/caffe2/image/CMakeLists.txt +++ b/caffe2/image/CMakeLists.txt @@ -11,6 +11,14 @@ if(USE_OPENCV AND OpenCV_FOUND) file(GLOB tmp *_test.cc) exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) + # ---[ HIP files + # ------[ general hip + file(GLOB_RECURSE tmp *_hip.cc) + set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) + # exclude test files + file(GLOB_RECURSE tmp *_test.cc) + exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) + # ---[ CPU files. file(GLOB tmp *.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) @@ -18,21 +26,29 @@ if(USE_OPENCV AND OpenCV_FOUND) file(GLOB tmp *_test.cc) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) + exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) # ---[ GPU test files file(GLOB tmp *_gpu_test.cc) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) + # ---[ HIP test files + file(GLOB_RECURSE tmp *_hip_test.cc) + set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) + # ---[ CPU test files file(GLOB tmp *_test.cc) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) + exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) # ---[ Send the lists to the parent scope. 
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) + set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) + set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) else() message(STATUS "Excluding image processing operators due to no opencv") endif() diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index de44ab67cf6c00..6f3986b837d308 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -340,8 +340,8 @@ Caffe2Backend::get_special_operators() const { {"Cast", &Caffe2Backend::CreateCast}, {"Constant", &Caffe2Backend::CreateConstant}, {"Conv", &Caffe2Backend::CreateConvPoolOpBase}, - {"AveragePool", &Caffe2Backend::CreateConvPoolOpBase}, - {"GlobalAveragePool", &Caffe2Backend::CreateConvPoolOpBase}, + {"AveragePool", &Caffe2Backend::CreatePadPool}, + {"GlobalAveragePool", &Caffe2Backend::CreatePadPool}, {"GlobalMaxPool", &Caffe2Backend::CreateConvPoolOpBase}, {"MaxPool", &Caffe2Backend::CreateConvPoolOpBase}, {"Reshape", &Caffe2Backend::CreateReshape}, @@ -515,6 +515,63 @@ Caffe2Ops Caffe2Backend::CreateConvPoolOpBase( return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); } +Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { + auto& node = onnx_node->node; + auto& attributes = onnx_node->attributes; + Caffe2Ops ret; + // Pad + bool padding = false; + const std::string pad_name = opset_version < 2 ? "paddings" : "pads"; + const auto pad_input = dummy_->NewDummyName(); + if (attributes.HasAttribute("count_include_pad") && + attributes.HasAttribute(pad_name)) { + auto count_include_pad = attributes.get("count_include_pad", 0L); + ::google::protobuf::RepeatedField<::google::protobuf::int64> pads; + pads = + attributes + .get<::google::protobuf::RepeatedField<::google::protobuf::int64>>( + pad_name); + if (count_include_pad == 1 && pads.size() == 4 && + !(pads.Get(0) == 0 && pads.Get(1) == 0 && pads.Get(2) == 0 && + pads.Get(3) == 0)) { + padding = true; + attributes.remove(pad_name); + caffe2::Argument arg_pads; + arg_pads.add_ints(pads.Get(0)); + arg_pads.add_ints(pads.Get(1)); + arg_pads.add_ints(pads.Get(2)); + arg_pads.add_ints(pads.Get(3)); + arg_pads.set_name("pads"); + auto* c2_op = ret.ops.Add(); + BuildOperator( + c2_op, "PadImage", {node.input(0)}, {pad_input}, {arg_pads}); + } else if (count_include_pad == 1) { + std::string str; + bool pads_flag = false; + str += "["; + for (const auto& i : pads) { + str += caffe2::to_string(i) + ","; + pads_flag = pads_flag || i > 0; + } + str += "]"; + if (pads_flag == true) { + CAFFE_THROW( + "Caffe2 only supports padding 2D Tensor, whereas padding is ", str); + } + } + } + // Pool + auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, opset_version); + auto* pool_op = c2_ops.ops.Mutable(0); + if (padding) { + pool_op->set_input(0, pad_input); + } + auto* c2_op = ret.ops.Add(); + c2_op->CopyFrom(*pool_op); + + return ret; +} + Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) { auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index e8a8ec3c65bc57..437e572b8528b7 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -168,6 +168,8 @@ class Caffe2Backend { Caffe2Ops CreateConvPoolOpBase(OnnxNode* onnx_node, int opset_version); + Caffe2Ops 
CreatePadPool(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateReshape(OnnxNode* onnx_node, int opset_version); Caffe2Ops CreateGather(OnnxNode* onnx_node, int opset_version); diff --git a/caffe2/onnx/helper.h b/caffe2/onnx/helper.h index 85b27dda87d524..42c9a639431de3 100644 --- a/caffe2/onnx/helper.h +++ b/caffe2/onnx/helper.h @@ -40,6 +40,18 @@ inline AttributeProto MakeAttribute( return attr; } +inline AttributeProto MakeAttribute( + const std::string& name, + const std::vector& vals) { + AttributeProto attr; + attr.set_name(name); + for (const auto v : vals) { + attr.add_floats(v); + } + attr.set_type(AttributeProto::FLOATS); + return attr; +} + inline AttributeProto MakeAttribute(const std::string& name, int64_t val) { AttributeProto attr; attr.set_name(name); diff --git a/caffe2/onnx/onnx_exporter.cc b/caffe2/onnx/onnx_exporter.cc index 02d870d511d6d2..0c36e26d248e9f 100644 --- a/caffe2/onnx/onnx_exporter.cc +++ b/caffe2/onnx/onnx_exporter.cc @@ -245,7 +245,8 @@ OnnxExporter::get_special_operators() const { {"LRN", &OnnxExporter::CreateLrnNodes}, {"Reshape", &OnnxExporter::CreateReshapeNodes}, {"Slice", &OnnxExporter::CreateSliceNodes}, - {"ChannelShuffle", &OnnxExporter::CreateChannelShuffleNodes}}; + {"ChannelShuffle", &OnnxExporter::CreateChannelShuffleNodes}, + {"ResizeNearest", &OnnxExporter::CreateUpsampleNodes}}; return kSpecialOperators; } @@ -681,6 +682,41 @@ ConvertedResult OnnxExporter::CreateChannelShuffleNodes( return result; } +ConvertedResult OnnxExporter::CreateUpsampleNodes( + const caffe2::OperatorDef& def, + const std::unordered_map& shapes) { + float width_scale = 1.0; + float height_scale = 1.0; + for (const auto& a : def.arg()) { + if (a.name() == "width_scale") { + width_scale = a.f(); + } else if (a.name() == "height_scale") { + height_scale = a.f(); + } + } + CAFFE_ENFORCE_GT(width_scale, 0); + CAFFE_ENFORCE_GT(height_scale, 0); + + auto x = def.input(0); + const auto& x_shape = shapes.at(x); + CAFFE_ENFORCE_GE(x_shape.dims().size(), 2); + + std::vector scales(x_shape.dims().size(), 1.0); + scales[scales.size() - 2] = height_scale; + scales[scales.size() - 1] = width_scale; + + ConvertedResult result; + auto& nodes = result.first; + std::vector inputs(def.input().begin(), def.input().end()); + std::vector outputs(def.output().begin(), def.output().end()); + auto node = MakeNode("Upsample", inputs, outputs, def.name()); + node.add_attribute()->CopyFrom(MakeAttribute("scales", scales)); + node.add_attribute()->CopyFrom(MakeAttribute("mode", "nearest")); + nodes.emplace_back(node); + + return result; +} + ConvertedResult OnnxExporter::CreateSliceNodes( const caffe2::OperatorDef& def, const std::unordered_map& shapes) { diff --git a/caffe2/onnx/onnx_exporter.h b/caffe2/onnx/onnx_exporter.h index 7fcd54044d9d66..51f62df0eb2212 100644 --- a/caffe2/onnx/onnx_exporter.h +++ b/caffe2/onnx/onnx_exporter.h @@ -92,6 +92,10 @@ class OnnxExporter { const caffe2::OperatorDef& def, const std::unordered_map& shapes); + ConvertedResult CreateUpsampleNodes( + const caffe2::OperatorDef& def, + const std::unordered_map& shapes); + // \brief Check black listed arguemnts where we won't pass down when // converting to ONNX node bool IsBlackListed(const caffe2::Argument& arg); diff --git a/caffe2/onnx/onnxifi_init.cc b/caffe2/onnx/onnxifi_init.cc index 7ec264b342fe63..62c44b0b38b3e1 100644 --- a/caffe2/onnx/onnxifi_init.cc +++ b/caffe2/onnx/onnxifi_init.cc @@ -11,7 +11,7 @@ onnxifi_library* initOnnxifiLibrary() { static onnxifi_library core{}; std::call_once(once, []() { 
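The CreateUpsampleNodes change above translates Caffe2's ResizeNearest into an ONNX Upsample node whose FLOATS attribute "scales" has one entry per input dimension, scaling only the two trailing spatial dimensions. A small sketch of that layout (the helper function is illustrative, not part of the exporter):

#include <cassert>
#include <cstddef>
#include <vector>

// Build the per-dimension scale factors the exporter emits: leading
// dimensions keep a scale of 1, the last two get height/width scales.
std::vector<float> upsampleScales(
    std::size_t rank, float height_scale, float width_scale) {
  assert(rank >= 2);
  std::vector<float> scales(rank, 1.0f);
  scales[rank - 2] = height_scale;
  scales[rank - 1] = width_scale;
  return scales;
}

// For a 4-D NCHW ResizeNearest with height_scale = width_scale = 2,
// upsampleScales(4, 2.0f, 2.0f) yields {1, 1, 2, 2}, which is attached as the
// "scales" attribute of an Upsample node with mode = "nearest".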
auto ret = - onnxifi_load(ONNXIFI_LOADER_FLAG_VERSION_1_0, nullptr, nullptr, &core); + onnxifi_load(ONNXIFI_LOADER_FLAG_VERSION_1_0, nullptr, &core); if (!ret) { CAFFE_THROW("Cannot load onnxifi lib"); } diff --git a/caffe2/operators/affine_channel_op.cc b/caffe2/operators/affine_channel_op.cc index 26953876b4891a..823a3cf8fee378 100644 --- a/caffe2/operators/affine_channel_op.cc +++ b/caffe2/operators/affine_channel_op.cc @@ -21,6 +21,8 @@ void AffineChannelScaleBiasBackwardNCHW( const int stride = C * HxW; EigenVectorArrayMap dscale_arr(dscale, C); EigenVectorArrayMap dbias_arr(dbias, C); + dscale_arr.setZero(); + dbias_arr.setZero(); for (int i = 0; i < N; ++i) { ConstEigenArrayMap dY_arr(dY_ptr, HxW, C); ConstEigenArrayMap X_arr(X_ptr, HxW, C); diff --git a/caffe2/operators/affine_channel_op_cudnn.cc b/caffe2/operators/affine_channel_op_cudnn.cc deleted file mode 100644 index e3bf3b140a3ec3..00000000000000 --- a/caffe2/operators/affine_channel_op_cudnn.cc +++ /dev/null @@ -1,371 +0,0 @@ -#include "caffe2/operators/affine_channel_op.h" - -#include -#include -#include - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/cudnn_wrappers.h" -#include "caffe2/core/types.h" -#include "caffe2/utils/conversions.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -namespace { - -class CuDNNAffineChannelOpBase : public Operator { - public: - CuDNNAffineChannelOpBase(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - order_(StringToStorageOrder( - OperatorBase::GetSingleArgument("order", "NCHW"))), - OP_SINGLE_ARG(bool, "is_learnable", is_learnable_, false), - cudnn_wrapper_(&context_) { - CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); - - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&scale_desc_)); - CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&mul_desc_)); - } - - virtual ~CuDNNAffineChannelOpBase() { - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(scale_desc_)); - CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(mul_desc_)); - } - - protected: - void SetTensorDesc4D( - const cudnnDataType_t cudnn_type, - const int N, - const int C, - const int H, - const int W) { - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - X_desc_, GetCudnnTensorFormat(order_), cudnn_type, N, C, H, W)); - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - scale_desc_, GetCudnnTensorFormat(order_), cudnn_type, 1, C, 1, 1)); - } - - void SetTensorDescND( - const cudnnDataType_t cudnn_type, - const std::vector& X_dims) { - const int ndim = X_dims.size(); - const int C_dim = order_ == StorageOrder::NCHW ? 
1 : ndim - 1; - const int C = X_dims[C_dim]; - std::vector X_strides(ndim); - X_strides.back() = 1; - for (int i = ndim - 1; i > 0; --i) { - X_strides[i - 1] = X_strides[i] * X_dims[i]; - } - std::vector scale_dims(ndim, 1); - scale_dims[C_dim] = C; - std::vector scale_strides(ndim); - std::fill(scale_strides.begin(), scale_strides.begin() + C_dim, C); - std::fill(scale_strides.begin() + C_dim, scale_strides.end(), 1); - CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( - X_desc_, cudnn_type, ndim, X_dims.data(), X_strides.data())); - CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( - scale_desc_, - cudnn_type, - ndim, - scale_dims.data(), - scale_strides.data())); - } - - const StorageOrder order_; - const bool is_learnable_; - - CuDNNWrapper cudnn_wrapper_; - cudnnTensorDescriptor_t X_desc_; - cudnnTensorDescriptor_t scale_desc_; - cudnnOpTensorDescriptor_t mul_desc_; -}; - -class CuDNNAffineChannelOp final : public CuDNNAffineChannelOpBase { - public: - CuDNNAffineChannelOp(const OperatorDef& operator_def, Workspace* ws) - : CuDNNAffineChannelOpBase(operator_def, ws) { - CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&add_desc_)); - } - - ~CuDNNAffineChannelOp() { - CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(add_desc_)); - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType() { - const auto& X = Input(0); - const auto& scale = Input(1); - const auto& bias = Input(2); - auto* Y = Output(0); - if (is_learnable_) { - CAFFE_ENFORCE_NE( - Y, - &X, - "In-place affine_channel_op is not supported when " - "is_learnable = true."); - } - Y->ResizeLike(X); - const T* X_data = X.data(); - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* Y_data = Y->mutable_data(); - const int ndim = X.ndim(); - CAFFE_ENFORCE_GE(ndim, 4); - const cudnnDataType_t cudnn_type = cudnnTypeWrapper::type; - if (ndim == 4) { - const int N = X.dim32(0); - const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3); - const int H = order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1); - const int W = order_ == StorageOrder::NCHW ? 
X.dim32(3) : X.dim32(2); - SetTensorDesc4D(cudnn_type, N, C, H, W); - } else { - const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); - SetTensorDescND(cudnn_type, X_dims); - } - CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - mul_desc_, CUDNN_OP_TENSOR_MUL, cudnn_type, CUDNN_PROPAGATE_NAN)); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - X_data, - cudnnTypeWrapper::kOne(), - scale_desc_, - scale_data, - cudnnTypeWrapper::kZero(), - X_desc_, - Y_data)); - if (ndim == 4) { - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - cudnnTypeWrapper::kOne(), - scale_desc_, - bias_data, - cudnnTypeWrapper::kOne(), - X_desc_, - Y_data)); - } else { - CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - add_desc_, CUDNN_OP_TENSOR_ADD, cudnn_type, CUDNN_PROPAGATE_NAN)); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - add_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - Y_data, - cudnnTypeWrapper::kOne(), - scale_desc_, - bias_data, - cudnnTypeWrapper::kZero(), - X_desc_, - Y_data)); - } - return true; - } - - private: - cudnnOpTensorDescriptor_t add_desc_; -}; - -class CuDNNAffineChannelGradientOp final : public CuDNNAffineChannelOpBase { - public: - CuDNNAffineChannelGradientOp(const OperatorDef& operator_def, Workspace* ws) - : CuDNNAffineChannelOpBase(operator_def, ws) { -#if CUDNN_VERSION_MIN(6, 0, 0) - CUDNN_ENFORCE(cudnnCreateReduceTensorDescriptor(&reduce_desc_)); -#endif - } - - ~CuDNNAffineChannelGradientOp() { -#if CUDNN_VERSION_MIN(6, 0, 0) - CUDNN_ENFORCE(cudnnDestroyReduceTensorDescriptor(reduce_desc_)); -#endif - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType() { - const auto& dY = Input(0); - const auto& scale = is_learnable_ ? 
Input(2) : Input(1); - auto* dX = Output(0); - dX->ResizeLike(dY); - const T* dY_data = dY.data(); - const T* scale_data = scale.data(); - T* dX_data = dX->mutable_data(); - const int ndim = dY.ndim(); - CAFFE_ENFORCE_GE(ndim, 4); - const cudnnDataType_t cudnn_type = cudnnTypeWrapper::type; - const std::vector X_dims(dY.dims().cbegin(), dY.dims().cend()); - SetTensorDescND(cudnn_type, X_dims); - CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - mul_desc_, CUDNN_OP_TENSOR_MUL, cudnn_type, CUDNN_PROPAGATE_NAN)); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - dY_data, - cudnnTypeWrapper::kOne(), - scale_desc_, - scale_data, - cudnnTypeWrapper::kZero(), - X_desc_, - dX_data)); - if (is_learnable_) { - const auto& X = Input(1); - const T* X_data = X.data(); - auto* dscale = Output(1); - auto* dbias = Output(2); - dscale->ResizeLike(scale); - dbias->ResizeLike(scale); - T* dscale_data = dscale->mutable_data(); - T* dbias_data = dbias->mutable_data(); - if (X.size() == scale.size()) { - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - dY_data, - cudnnTypeWrapper::kOne(), - X_desc_, - X_data, - cudnnTypeWrapper::kZero(), - X_desc_, - dscale_data)); - context_.Copy( - dY.size(), dY_data, dbias_data); - } else { - dYxX_.ResizeLike(X); - T* dYxX_data = dYxX_.mutable_data(); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - dY_data, - cudnnTypeWrapper::kOne(), - X_desc_, - X_data, - cudnnTypeWrapper::kZero(), - X_desc_, - dYxX_data)); -#if CUDNN_VERSION_MIN(6, 0, 0) - ComputeScaleBiasGradient( - dYxX_data, dY_data, dscale_data, dbias_data); -#else - const int N = X.dim32(0); - const int C = - order_ == StorageOrder::NCHW ? 
X.dim32(1) : X.dim32(ndim - 1); - const int HxW = X.size() / (N * C); - ComputeScaleBiasGradientFallback( - N, C, HxW, dYxX_data, dY_data, dscale_data, dbias_data); -#endif - } - } - return true; - } - - private: -#if CUDNN_VERSION_MIN(6, 0, 0) - template - void - ComputeScaleBiasGradient(const T* dYxX, const T* dY, T* dscale, T* dbias) { - const cudnnDataType_t cudnn_type = cudnnTypeWrapper::type; - CUDNN_ENFORCE(cudnnSetReduceTensorDescriptor( - reduce_desc_, - CUDNN_REDUCE_TENSOR_ADD, - cudnn_type, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - std::size_t workspace_size = 0; - CUDNN_ENFORCE(cudnnGetReductionWorkspaceSize( - cudnn_wrapper_.inline_cudnn_handle(), - reduce_desc_, - X_desc_, - scale_desc_, - &workspace_size)); - workspace_buff_.Resize((workspace_size + sizeof(T) - 1) / sizeof(T)); - T* workspace_data = workspace_buff_.mutable_data(); - CUDNN_ENFORCE(cudnnReduceTensor( - cudnn_wrapper_.inline_cudnn_handle(), - reduce_desc_, - nullptr, - 0, - workspace_data, - workspace_size, - cudnnTypeWrapper::kOne(), - X_desc_, - dYxX, - cudnnTypeWrapper::kZero(), - scale_desc_, - dscale)); - CUDNN_ENFORCE(cudnnReduceTensor( - cudnn_wrapper_.inline_cudnn_handle(), - reduce_desc_, - nullptr, - 0, - workspace_data, - workspace_size, - cudnnTypeWrapper::kOne(), - X_desc_, - dY, - cudnnTypeWrapper::kZero(), - scale_desc_, - dbias)); - } -#else - template - void ComputeScaleBiasGradientFallback( - const int N, - const int C, - const int HxW, - const T* dYxX, - const T* dY, - T* dscale, - T* dbias) { - if (order_ == StorageOrder::NCHW) { - std::array dims = {N, C, HxW}; - std::array axes = {0, 2}; - math::ReduceSum( - 3, dims.data(), 2, axes.data(), dYxX, dscale, &context_); - math::ReduceSum( - 3, dims.data(), 2, axes.data(), dY, dbias, &context_); - } else { - std::array dims = {N * HxW, C}; - const int axis = 0; - math::ReduceSum( - 2, dims.data(), 1, &axis, dYxX, dscale, &context_); - math::ReduceSum( - 2, dims.data(), 1, &axis, dY, dbias, &context_); - } - } -#endif - - Tensor dYxX_; - -#if CUDNN_VERSION_MIN(6, 0, 0) - cudnnReduceTensorDescriptor_t reduce_desc_; - - Tensor workspace_buff_; -#endif -}; - -} // namespace - -REGISTER_CUDNN_OPERATOR(AffineChannel, CuDNNAffineChannelOp); -REGISTER_CUDNN_OPERATOR(AffineChannelGradient, CuDNNAffineChannelGradientOp); - -} // namespace caffe2 diff --git a/caffe2/operators/batch_moments_op.cc b/caffe2/operators/batch_moments_op.cc new file mode 100644 index 00000000000000..8247b9af228cb8 --- /dev/null +++ b/caffe2/operators/batch_moments_op.cc @@ -0,0 +1,122 @@ +#include "caffe2/operators/batch_moments_op.h" + +#include +#include + +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNCHW( + const int N, + const int C, + const int HxW, + const float* X, + float* mu, + float* var) { + math::Set(C, 0.0f, mu, &context_); + math::Set(C, 0.0f, var, &context_); + EigenVectorArrayMap mu_arr(mu, C); + EigenVectorArrayMap var_arr(var, C); + const float* X_ptr = X; + const int stride = C * HxW; + for (int i = 0; i < N; ++i) { + ConstEigenArrayMap X_arr(X_ptr, HxW, C); + mu_arr += X_arr.colwise().sum(); + var_arr += X_arr.square().colwise().sum(); + X_ptr += stride; + } + const float scale = 1.0f / static_cast(N * HxW); + math::Scale(C, scale, mu, mu, &context_); + math::Scale(C, scale, var, var, &context_); + return true; +} + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNHWC( + const int N, + const int C, + const int 
HxW, + const float* X, + float* mu, + float* var) { + ConstEigenArrayMap X_arr(X, C, N * HxW); + EigenVectorMap(mu, C) = X_arr.rowwise().mean(); + EigenVectorMap(var, C) = X_arr.square().rowwise().mean(); + return true; +} + +template <> +bool BatchMomentsGradientOp::ComputeBatchMomentsGradientNCHW( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + ConstEigenVectorArrayMap dmu_arr(dmu, C); + ConstEigenVectorArrayMap dvar_arr(dvar, C); + const float* X_ptr = X; + float* dX_ptr = dX; + const int stride = C * HxW; + for (int i = 0; i < N; ++i) { + EigenArrayMap dX_arr(dX_ptr, HxW, C); + dX_arr = ConstEigenArrayMap(X_ptr, HxW, C).rowwise() * + dvar_arr.transpose() * 2.0f; + dX_arr.rowwise() += dmu_arr.transpose(); + X_ptr += stride; + dX_ptr += stride; + } + const float scale = 1.0f / static_cast(N * HxW); + math::Scale(N * C * HxW, scale, dX, dX, &context_); + return true; +} + +template <> +bool BatchMomentsGradientOp::ComputeBatchMomentsGradientNHWC( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + const float scale = 1.0f / static_cast(N * HxW); + EigenArrayMap dX_arr(dX, C, N * HxW); + dX_arr = ConstEigenArrayMap(X, C, N * HxW).colwise() * + ConstEigenVectorArrayMap(dvar, C) * 2.0f; + dX_arr.colwise() += ConstEigenVectorArrayMap(dmu, C); + math::Scale(N * C * HxW, scale, dX, dX, &context_); + return true; +} + +REGISTER_CPU_OPERATOR(BatchMoments, BatchMomentsOp); +REGISTER_CPU_OPERATOR( + BatchMomentsGradient, + BatchMomentsGradientOp); + +OPERATOR_SCHEMA(BatchMoments).NumInputs(1).NumOutputs(2); +OPERATOR_SCHEMA(BatchMomentsGradient).NumInputs(3).NumOutputs(1); + +namespace { + +class GetBatchMomentsGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "BatchMomentsGradient", + "", + std::vector{GO(0), GO(1), I(0)}, + std::vector{GI(0)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(BatchMoments, GetBatchMomentsGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu new file mode 100644 index 00000000000000..7aadc8d8e69e07 --- /dev/null +++ b/caffe2/operators/batch_moments_op.cu @@ -0,0 +1,152 @@ +#include "caffe2/operators/batch_moments_op.h" + +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +using BlockReduce = cub::BlockReduce; + +template +__global__ void BatchMomentsCUDAKernel( + const int N, + const int C, + const int HxW, + const T* X, + T* mu, + T* var) { + const int outer_size = C; + const int inner_size = N * HxW; + __shared__ typename BlockReduce::TempStorage m_storage; + __shared__ typename BlockReduce::TempStorage v_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T m_sum = 0; + T v_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = kOrder == StorageOrder::NCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; +#if __CUDA_ARCH__ >= 350 + m_sum += __ldg(X + index); + v_sum += __ldg(X + index) * __ldg(X + index); +#else + m_sum += X[index]; + v_sum += X[index] * X[index]; +#endif + } + m_sum = BlockReduce(m_storage).Reduce(m_sum, cub::Sum()); + v_sum = BlockReduce(v_storage).Reduce(v_sum, cub::Sum()); + if (threadIdx.x == 0) { + mu[i] = m_sum / static_cast(N * HxW); + var[i] = v_sum / static_cast(N * HxW); + } + __syncthreads(); + } +} + +template +__global__ void BatchMomentsGradientCUDAKernel( + const int N, + const int C, + const int HxW, + const T* dmu, + const T* dvar, + const T* X, + T* dX) { + const int size = N * C * HxW; + const T scale = T(1) / static_cast(N * HxW); + CUDA_1D_KERNEL_LOOP(i, size) { + const int i_mu = kOrder == StorageOrder::NCHW ? i / (HxW) % C : i % C; +#if __CUDA_ARCH__ >= 350 + dX[i] = + (__ldg(dmu + i_mu) + __ldg(dvar + i_mu) * T(2) * __ldg(X + i)) * scale; +#else + dX[i] = (dmu[i_mu] + dvar[i_mu] * T(2) * X[i]) * scale; +#endif + } +} + +} // namespace + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNCHW( + const int N, + const int C, + const int HxW, + const float* X, + float* mu, + float* var) { + const int outer_size = N * HxW; + BatchMomentsCUDAKernel + <<>>(N, C, HxW, X, mu, var); + return true; +} + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNHWC( + const int N, + const int C, + const int HxW, + const float* X, + float* mu, + float* var) { + const int outer_size = N * HxW; + BatchMomentsCUDAKernel + <<>>(N, C, HxW, X, mu, var); + return true; +} + +template <> +bool BatchMomentsGradientOp:: + ComputeBatchMomentsGradientNCHW( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + const int size = N * C * HxW; + BatchMomentsGradientCUDAKernel + <<>>(N, C, HxW, dmu, dvar, X, dX); + return true; +} + +template <> +bool BatchMomentsGradientOp:: + ComputeBatchMomentsGradientNHWC( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + const int size = N * C * HxW; + BatchMomentsGradientCUDAKernel + <<>>(N, C, HxW, dmu, dvar, X, dX); + return true; +} + +REGISTER_CUDA_OPERATOR(BatchMoments, BatchMomentsOp); +REGISTER_CUDA_OPERATOR( + BatchMomentsGradient, + BatchMomentsGradientOp); + +} // namespace caffe2 diff --git a/caffe2/operators/batch_moments_op.h b/caffe2/operators/batch_moments_op.h new file mode 100644 index 00000000000000..eea4c84943d0fe --- /dev/null +++ b/caffe2/operators/batch_moments_op.h @@ -0,0 +1,117 @@ +#ifndef CAFFE2_OPERATORS_BATCH_MOMENTS_OP_H_ +#define CAFFE2_OPERATORS_BATCH_MOMENTS_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class BatchMomentsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BatchMomentsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); + } + + bool RunOnDevice() override { + const auto& X = Input(0); + auto* mu = Output(0); + auto* var = Output(1); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = order_ == StorageOrder::NCHW ? 
X.dim32(1) : X.dim32(ndim - 1); + const int HxW = X.size() / (N * C); + mu->Resize(C); + var->Resize(C); + const T* X_data = X.template data(); + T* mu_data = mu->template mutable_data(); + T* var_data = var->template mutable_data(); + return order_ == StorageOrder::NCHW + ? ComputeBatchMomentsNCHW(N, C, HxW, X_data, mu_data, var_data) + : ComputeBatchMomentsNHWC(N, C, HxW, X_data, mu_data, var_data); + } + + private: + bool ComputeBatchMomentsNCHW( + const int N, + const int C, + const int HxW, + const T* X, + T* mu, + T* var); + + bool ComputeBatchMomentsNHWC( + const int N, + const int C, + const int HxW, + const T* X, + T* mu, + T* var); + + const StorageOrder order_; +}; + +template +class BatchMomentsGradientOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BatchMomentsGradientOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); + } + + bool RunOnDevice() override { + const auto& dmu = Input(0); + const auto& dvar = Input(1); + const auto& X = Input(2); + auto* dX = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1); + const int HxW = X.size() / (N * C); + dX->ResizeLike(X); + const T* dmu_data = dmu.template data(); + const T* dvar_data = dvar.template data(); + const T* X_data = X.template data(); + T* dX_data = dX->template mutable_data(); + return order_ == StorageOrder::NCHW + ? ComputeBatchMomentsGradientNCHW( + N, C, HxW, dmu_data, dvar_data, X_data, dX_data) + : ComputeBatchMomentsGradientNHWC( + N, C, HxW, dmu_data, dvar_data, X_data, dX_data); + } + + private: + bool ComputeBatchMomentsGradientNCHW( + const int N, + const int C, + const int HxW, + const T* dmu, + const T* dvar, + const T* X, + T* dX); + + bool ComputeBatchMomentsGradientNHWC( + const int N, + const int C, + const int HxW, + const T* dmu, + const T* dvar, + const T* X, + T* dX); + + const StorageOrder order_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BATCH_MOMENTS_OP_H_ diff --git a/caffe2/operators/batch_sparse_to_dense_op.h b/caffe2/operators/batch_sparse_to_dense_op.h index de6c69b795d128..0e854d9e467bc0 100644 --- a/caffe2/operators/batch_sparse_to_dense_op.h +++ b/caffe2/operators/batch_sparse_to_dense_op.h @@ -19,6 +19,9 @@ class BatchSparseToDenseOp : public Operator { OP_SINGLE_ARG(T, "default_value", default_value_, static_cast(0)) {} bool RunOnDevice() override; + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + private: TIndex dense_last_dim_; T default_value_; diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu index a976e7159309e8..85315768bd85d9 100644 --- a/caffe2/operators/boolean_mask_ops.cu +++ b/caffe2/operators/boolean_mask_ops.cu @@ -6,13 +6,12 @@ namespace caffe2 { namespace { -template __global__ void BooleanMaskCopyKernel( const TIndex numOfOutput, const TIndex numBytes, const TIndex* indices, - const T* src, - T* dest) { + const uint8_t* src, + uint8_t* dest) { for (TIndex i = blockIdx.x; i < numOfOutput; i += gridDim.x) { const auto srcBase = indices[i] * numBytes; const auto destBase = i * numBytes; @@ -81,8 +80,8 @@ class BooleanMaskOp final : public Operator { std::vector dims = src.dims(); dims[0] = numOfOutput; dest->Resize(dims); - auto* destData = (char*)dest->raw_mutable_data(src.meta()); - const auto* srcData = 
(char*)src.raw_data(); + auto* destData = (uint8_t*)dest->raw_mutable_data(src.meta()); + const auto* srcData = (uint8_t*)src.raw_data(); if (OutputSize() == 2) { auto* indicesOut = Output(1); indicesOut->Resize(numOfOutput); diff --git a/caffe2/operators/boolean_unmask_ops.cu b/caffe2/operators/boolean_unmask_ops.cu index 42801e17a64815..dcdec9c33df7be 100644 --- a/caffe2/operators/boolean_unmask_ops.cu +++ b/caffe2/operators/boolean_unmask_ops.cu @@ -27,7 +27,7 @@ __global__ void FillValuesKernel( const size_t itemSize, const int* indices, char* const values[], - int valueSizes[], + int* valueSizes, char* dest) { CUDA_1D_KERNEL_LOOP(j, numMasks) { int k = 0; diff --git a/caffe2/operators/byte_weight_dequant_op.cc b/caffe2/operators/byte_weight_dequant_op.cc new file mode 100644 index 00000000000000..6596fff76647ad --- /dev/null +++ b/caffe2/operators/byte_weight_dequant_op.cc @@ -0,0 +1,11 @@ +#include "caffe2/operators/byte_weight_dequant_op.h" + +#include "caffe2/utils/math.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(ByteWeightDequant, ByteWeightDequantOp); + +OPERATOR_SCHEMA(ByteWeightDequant).NumInputs(1).NumOutputs(1); + +} // namespace caffe2 diff --git a/caffe2/operators/byte_weight_dequant_op.h b/caffe2/operators/byte_weight_dequant_op.h new file mode 100644 index 00000000000000..14df6826a3bfcd --- /dev/null +++ b/caffe2/operators/byte_weight_dequant_op.h @@ -0,0 +1,55 @@ +#ifndef CAFFE2_OPERATORS_BYTE_WEIGHT_DEQUANT_OP_H_ +#define CAFFE2_OPERATORS_BYTE_WEIGHT_DEQUANT_OP_H_ + +#include "caffe2/core/operator.h" +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class ByteWeightDequantOp : public Operator { + public: + ByteWeightDequantOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + min_(OperatorBase::GetSingleArgument("min", -3)), + max_(OperatorBase::GetSingleArgument("max", 3)), + shape_(OperatorBase::GetRepeatedArgument("shape")) {} + + USE_OPERATOR_FUNCTIONS(Context); + using Operator::Operator; + + bool RunOnDevice() override { + const auto& WI = Input(0); + auto* Y = Output(0); + Y->Resize(shape_); + float bin_interval = (max_ - min_) / 255.0; + int total = 1; + for (int i = 0; i < shape_.size(); i++) { + total *= Y->dim(i); + } + const uint8_t* Xdata; + if (WI.template IsType()) { + CAFFE_ENFORCE(total, WI.nbytes()); + Xdata = WI.template data(); + } else { + CAFFE_ENFORCE(total, WI.template data()[0].size()); + Xdata = reinterpret_cast( + WI.template data()[0].c_str()); + } + auto* Ydata = Y->template mutable_data(); + ConstEigenVectorMap index(&Xdata[0], total); + EigenVectorMap weights(&Ydata[0], total); + weights = (index.cast().array() * bin_interval) + min_; + return true; + } + + private: + float min_; + float max_; + std::vector shape_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BYTE_WEIGHT_DEQUANT_OP_H_ diff --git a/caffe2/operators/channel_shuffle_op_gpu.cu b/caffe2/operators/channel_shuffle_op_gpu.cu index 7e53add8615bee..447ba55d3436fa 100644 --- a/caffe2/operators/channel_shuffle_op_gpu.cu +++ b/caffe2/operators/channel_shuffle_op_gpu.cu @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "channel_shuffle_op.h" +#include "caffe2/operators/channel_shuffle_op.h" namespace caffe2 { diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc index 70570bde1c4161..eb5a762cbd3e3e 100644 --- a/caffe2/operators/conv_op_shared_gpu.cc +++ b/caffe2/operators/conv_op_shared_gpu.cc @@ -1,5 +1,5 @@ #include 
"caffe2/core/context_gpu.h" -#include "conv_op_shared.h" +#include "caffe2/operators/conv_op_shared.h" namespace caffe2 { diff --git a/caffe2/operators/cosine_embedding_criterion_op.cu b/caffe2/operators/cosine_embedding_criterion_op.cu index 792062f086b593..69a37ff3294453 100644 --- a/caffe2/operators/cosine_embedding_criterion_op.cu +++ b/caffe2/operators/cosine_embedding_criterion_op.cu @@ -9,7 +9,7 @@ __global__ void CECKernel( const int N, const float* S, const int* Y, const float margin, float* output) { CUDA_1D_KERNEL_LOOP(i, N) { - output[i] = Y[i] == 1 ? (1. - S[i]) : max(0.f, S[i] - margin); + output[i] = Y[i] == 1 ? (1. - S[i]) : fmaxf(0.f, S[i] - margin); } } diff --git a/caffe2/operators/counter_ops_gpu.cc b/caffe2/operators/counter_ops_gpu.cc index 7880aeeb419379..1c157633a62093 100644 --- a/caffe2/operators/counter_ops_gpu.cc +++ b/caffe2/operators/counter_ops_gpu.cc @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "counter_ops.h" +#include "caffe2/operators/counter_ops.h" namespace caffe2 { REGISTER_CUDA_OPERATOR(CreateCounter, CreateCounterOp); diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu index 037b1a50e1fd59..e1a56399a2f947 100644 --- a/caffe2/operators/distance_op.cu +++ b/caffe2/operators/distance_op.cu @@ -131,9 +131,9 @@ __global__ void L1DistanceKernel( for (int i = blockIdx.x; i < N; i += gridDim.x) { float sum = 0.0f; for (int j = threadIdx.x; j < D; j += blockDim.x) { - sum += - abs(convert::To(X[i * D + j]) - - convert::To(Y[i * D + j])); + sum += fabsf( + convert::To(X[i * D + j]) - + convert::To(Y[i * D + j])); } float aggregate = BlockReduce(temp_storage).Sum(sum); @@ -395,33 +395,33 @@ bool CosineSimilarityGradientOp::RunOnDevice() { context_.cuda_stream()>>>(N, D, X_data, Y_data, xy); math::Div(N, dCos_data, xyn, scale, &context_); // dX - BatchedMul<<< + BatchedMul<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, D, Y_data, scale, dX_data); - Scale2AxpyScale<<< + Scale2AxpyScale<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, scale, xy, xn, axpy_scale); - BatchedAxpy<<< + BatchedAxpy<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, D, axpy_scale, X_data, dX_data); // dY - BatchedMul<<< + BatchedMul<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, D, X_data, scale, dY_data); - Scale2AxpyScale<<< + Scale2AxpyScale<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, scale, xy, yn, axpy_scale); - BatchedAxpy<<< + BatchedAxpy<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu index 503675a86d7de8..e4cd235eeffa38 100644 --- a/caffe2/operators/elementwise_linear_op.cu +++ b/caffe2/operators/elementwise_linear_op.cu @@ -1,6 +1,6 @@ #include -#include "elementwise_linear_op.h" +#include "caffe2/operators/elementwise_linear_op.h" #include "caffe2/core/context_gpu.h" #include "caffe2/operators/operator_fallback_gpu.h" diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu index b564d8ae8005f8..1dee0d62724706 100644 --- a/caffe2/operators/elementwise_ops.cu +++ b/caffe2/operators/elementwise_ops.cu @@ -8,6 +8,11 @@ #include "caffe2/core/context_gpu.h" #include "caffe2/utils/conversions.h" +#ifdef __HIPCC__ +// rocblas doesn't fully 
support fp16 yet +#define ROCBLAS_FP16 0 +#endif + namespace caffe2 { REGISTER_CUDA_OPERATOR( @@ -111,6 +116,9 @@ void device_reduce( int N, Tensor* buffer, CUDAContext* context) { +#if defined(__HIPCC__) && !ROCBLAS_FP16 + CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet."); +#else auto buffer_size = 1; if (buffer->size() != buffer_size) { @@ -135,6 +143,7 @@ void device_reduce( out, CUDA_R_16F, CUDA_R_32F)); +#endif } template diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 3fb7ed92e90a6a..af9214379becd8 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -2,6 +2,11 @@ #include #include "caffe2/core/flags.h" +#include "caffe2/core/macros.h" + +#ifdef CAFFE2_USE_OPENCV +#include +#endif // CAFFE2_USE_OPENCV namespace caffe2 { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 39e7febe272961..5d6f87d4d30563 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -4,12 +4,13 @@ #include #include "caffe2/core/logging.h" +#include "caffe2/core/macros.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" -#if defined(CV_MAJOR_VERSION) && (CV_MAJOR_VERSION >= 3) +#ifdef CAFFE2_USE_OPENCV #include -#endif // CV_MAJOR_VERSION >= 3 +#endif // CAFFE2_USE_OPENCV namespace caffe2 { namespace utils { diff --git a/caffe2/operators/group_norm_op.cu b/caffe2/operators/group_norm_op.cu index daf3ab91e11161..cfdd308bceb050 100644 --- a/caffe2/operators/group_norm_op.cu +++ b/caffe2/operators/group_norm_op.cu @@ -6,7 +6,7 @@ // This is a stand-alone op: Y = gamma * (X - mu) / sig + beta // ------------------------------------------------------------------ -#include "group_norm_op.h" +#include "caffe2/operators/group_norm_op.h" #include diff --git a/caffe2/operators/gru_unit_op_gpu.cu b/caffe2/operators/gru_unit_op_gpu.cu index 2df357bf057865..ee923ae7a9667c 100644 --- a/caffe2/operators/gru_unit_op_gpu.cu +++ b/caffe2/operators/gru_unit_op_gpu.cu @@ -2,7 +2,7 @@ #include #include #include "caffe2/core/context_gpu.h" -#include "gru_unit_op.h" +#include "caffe2/operators/gru_unit_op.h" namespace caffe2 { diff --git a/caffe2/operators/heatmap_max_keypoint_op.cc b/caffe2/operators/heatmap_max_keypoint_op.cc new file mode 100644 index 00000000000000..ed714bdfddec38 --- /dev/null +++ b/caffe2/operators/heatmap_max_keypoint_op.cc @@ -0,0 +1,161 @@ +#include "heatmap_max_keypoint_op.h" +#include "caffe2/utils/eigen_utils.h" + +namespace caffe2 { +namespace { + +REGISTER_CPU_OPERATOR( + HeatmapMaxKeypoint, + HeatmapMaxKeypointOp); + +// Input: heatmaps [size x size], boxes [x0, y0, x1, y1] +// Output: keypoints (#rois, 4, #keypoints) +OPERATOR_SCHEMA(HeatmapMaxKeypoint).NumInputs(2).NumOutputs(1); + +SHOULD_NOT_DO_GRADIENT(HeatmapMaxKeypoint); +} // namespace + +/** +Mask R-CNN uses bicubic upscaling before taking the maximum of the heat map +for keypoints. We would like to avoid bicubic upscaling, because it is +computationally expensive. This approach uses the Taylor expansion up to the +quadratic terms on approximation of the heatmap function. 
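Spelling out that expansion with the symbols used in the code: place the origin at the heatmap argmax and let f be the heatmap restricted to the surrounding 3x3 window; then

  f(\delta) \approx f(0) + f'(0)^\top \delta + \tfrac{1}{2}\,\delta^\top f''(0)\,\delta,
  \qquad b := -f'(0), \quad A := f''(0),
  \nabla_\delta f = 0 \;\Longrightarrow\; A\,\delta = b \;\Longrightarrow\; \delta = A^{-1} b,
  \qquad f(\delta) \approx f(0) - b^\top \delta + \tfrac{1}{2}\,\delta^\top A\,\delta.

In the implementation, f'(0) and f''(0) are estimated with central/finite differences on the (boundary-mirrored) 3x3 grid, the 2x2 system A\delta = b is solved via an LDLT factorization (falling back to \delta = 0 when det(A) is near zero), \delta is clipped to at most 1.5 cells, and the refined location is mapped back to image coordinates as x0 + (0.5 + maxX + \delta_x) * xLen / heatmap_size, and analogously for y.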
+**/ +template <> +bool HeatmapMaxKeypointOp::RunOnDevice() { + const auto& heatmaps_in = Input(0); + const auto& bboxes_in = Input(1); + auto* keypoints_out = Output(0); + + CAFFE_ENFORCE_EQ(heatmaps_in.ndim(), 4); + const int N = heatmaps_in.dim32(0); + CAFFE_ENFORCE_EQ(heatmaps_in.dim32(0), N); + const int keypoint_count = heatmaps_in.dim32(1); + const int heatmap_size = heatmaps_in.dim32(2); + CAFFE_ENFORCE_GE(heatmap_size, 2); // at least 2x2 for approx + CAFFE_ENFORCE_EQ(heatmaps_in.dim32(2), heatmaps_in.dim32(3)); + + CAFFE_ENFORCE_EQ(bboxes_in.ndim(), 2); + CAFFE_ENFORCE_EQ(bboxes_in.dim32(0), N); + CAFFE_ENFORCE_GE(bboxes_in.dim32(1), 4); + + // Wrap inputs in Eigen + Eigen::Map heatmaps( + heatmaps_in.data(), + heatmaps_in.dim32(0) * heatmaps_in.dim32(1), + heatmaps_in.dim32(2) * heatmaps_in.dim32(3)); + Eigen::Map bboxes( + bboxes_in.data(), bboxes_in.dim32(0), bboxes_in.dim32(1)); + + // Calculate the softmax + ERArrXXf probs( + heatmaps_in.dim32(0) * heatmaps_in.dim32(1), + heatmaps_in.dim32(2) * heatmaps_in.dim32(3)); + if (should_output_softmax_) { + // softmax output is expensive to compute, if should_output_softmax is not + // specified, don't populate it + ERArrXXf heatmap_exp = heatmaps.exp(); + for (int r = 0; r < N * keypoint_count; r++) { + probs.row(r) = heatmap_exp.row(r) / heatmap_exp.row(r).sum(); + } + } /* otherwise not initialized */ + + // Resize and wrap outputs in Eigen + keypoints_out->Resize(N, 4, keypoint_count); + Eigen::Map keypoints( + keypoints_out->mutable_data(), N, 4 * keypoint_count); + + EArrXi maxIndices(N * keypoint_count); + // finding max value first (only maxCoeff() is vectorized, not + // maxCoeff(&index)), then find the index (equalness check is also fast) + EArrXf maxScores = heatmaps.rowwise().maxCoeff(); + for (int r = 0; r < N * keypoint_count; r++) { + float maxScore = maxScores[r]; + for (int c = 0; c < heatmap_size * heatmap_size; c++) { + if (heatmaps(r, c) == maxScore) { + maxIndices[r] = c; + break; + } + } + } + + // Populate outputs + for (int k = 0; k < N; k++) { // For each box, even skipped + + float x0 = bboxes(k, 0); + float y0 = bboxes(k, 1); + float xLen = std::max(bboxes(k, 2) - bboxes(k, 0), 1.0f); + float yLen = std::max(bboxes(k, 3) - bboxes(k, 1), 1.0f); + + // Extract max keypoints and probabilities from heatmaps + for (int j = 0; j < keypoint_count; j++) { + const int heatmap_index = k * keypoint_count + j; + const int maxIndex = maxIndices[heatmap_index]; + const float maxScore = maxScores[heatmap_index]; + const int maxY = maxIndex / heatmap_size; + const int maxX = maxIndex - heatmap_size * maxY; + + assert(heatmaps(heatmap_index, maxIndex) == maxScore); + ERArrXXf fmax = ERArrXXf::Zero(3, 3); + + // initialize fmax values of local 3x3 grid + // when 3x3 grid going out-of-bound, mirrowing around center + for (int y = -1; y <= 1; y++) { + for (int x = -1; x <= 1; x++) { + int xx = x - 2 * (x + maxX >= heatmap_size) + 2 * (x + maxX < 0); + int yy = y - 2 * (y + maxY >= heatmap_size) + 2 * (y + maxY < 0); + assert((xx + maxX < heatmap_size) && (xx + maxX >= 0)); + assert((yy + maxY < heatmap_size) && (yy + maxY >= 0)); + const int coord_index = (yy + maxY) * heatmap_size + xx + maxX; + fmax(y + 1, x + 1) = heatmaps(heatmap_index, coord_index); + } + } + + // b = -f'(0), A = f''(0) Hessian matrix + EVecXf b(2); + b << -(fmax(1, 2) - fmax(1, 0)) / 2, -(fmax(2, 1) - fmax(0, 1)) / 2; + EMatXf A(2, 2); + A << fmax(1, 0) - 2 * fmax(1, 1) + fmax(1, 2), + (fmax(2, 2) - fmax(2, 0) - fmax(0, 2) + fmax(0, 0)) / 4, + (fmax(2, 
2) - fmax(2, 0) - fmax(0, 2) + fmax(0, 0)) / 4, + fmax(0, 1) - 2 * fmax(1, 1) + fmax(2, 1); + + // Solve Ax=b + const float div = A.determinant(); + EVecXf delta(2); + float deltaScore; + const float MAX_DELTA = 1.5; + if (std::abs(div) < 1e-4f) { + delta << 0.0f, 0.0f; + deltaScore = maxScore; + } else { + delta = A.ldlt().solve(b); + // clip delta if going out-of-range of 3x3 grid + if (std::abs(delta(0)) > MAX_DELTA || std::abs(delta(1)) > MAX_DELTA) { + float larger_delta = std::max(std::abs(delta(0)), std::abs(delta(1))); + delta(0) = delta(0) / larger_delta * MAX_DELTA; + delta(1) = delta(1) / larger_delta * MAX_DELTA; + } + deltaScore = fmax(1, 1) - b.transpose() * delta + + 1.0 / 2.0 * delta.transpose() * A * delta; + } + assert(std::abs(delta(0)) <= MAX_DELTA); + assert(std::abs(delta(1)) <= MAX_DELTA); + // find maximum of detla scores + keypoints(k, 0 * keypoint_count + j) = + x0 + (0.5 + maxX + delta(0)) * xLen / heatmap_size; + keypoints(k, 1 * keypoint_count + j) = + y0 + (0.5 + maxY + delta(1)) * yLen / heatmap_size; + keypoints(k, 2 * keypoint_count + j) = deltaScore; + if (should_output_softmax_) { + keypoints(k, 3 * keypoint_count + j) = probs(heatmap_index, maxIndex); + } else { + keypoints(k, 3 * keypoint_count + j) = .0f; + } + } + } + + return true; +} + +} // namespace caffe2 diff --git a/caffe2/operators/heatmap_max_keypoint_op.h b/caffe2/operators/heatmap_max_keypoint_op.h new file mode 100644 index 00000000000000..352c9ff109de94 --- /dev/null +++ b/caffe2/operators/heatmap_max_keypoint_op.h @@ -0,0 +1,31 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#ifndef HEATMAP_MAX_KEYPOINT_OP_H_ +#define HEATMAP_MAX_KEYPOINT_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class HeatmapMaxKeypointOp final : public Operator { + public: + HeatmapMaxKeypointOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + should_output_softmax_(OperatorBase::GetSingleArgument( + "should_output_softmax", + false)) {} + USE_OPERATOR_CONTEXT_FUNCTIONS; + + bool RunOnDevice() override; + + protected: + bool should_output_softmax_ = false; +}; + +} // namespace caffe2 + +#endif // HEATMAP_MAX_KEYPOINT_OP_H_ diff --git a/caffe2/operators/hip/operator_fallback_hip.h b/caffe2/operators/hip/operator_fallback_hip.h deleted file mode 100644 index 62e5fe8f01e5dc..00000000000000 --- a/caffe2/operators/hip/operator_fallback_hip.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_ -#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_ - -#include "caffe2/core/common.h" -#include "caffe2/core/context.h" -#include "caffe2/core/hip/context_hip.h" -#include "caffe2/core/operator.h" -#include "caffe2/proto/caffe2.pb.h" - -namespace caffe2 { - -/** - * @brief A templated class to allow one to wrap a CPU operator as a CUDA - * operator. - * - * This class can be used when one does not have the CUDA implementation ready - * yet for an operator. Essentially, what this op does is to automatically - * deal with data copy for you. Plausibly, this causes a lot of overhead and - * is not optimal, so you should use this operator mostly for quick prototyping - * purpose. - * - * All the input and output of the original operator should be TensorCPU. 
- * - * Example usage: if you have a class MyMagicOp that is CPU based, and you use - * the registration code - * REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp); - * to register the CPU side, you can create its corresponding GPU operator - * (with performance hits of course) via - * REGISTER_HIP_OPERATOR(MyMagic, - * GPUFallbackOp); - * - * Advanced usage: if you want to have some specific outputs never copied, you - * can use the SkipOutputCopy template argument to do that. For example, if - * MyMagic produces two outputs and the first output is always going to live on - * the CPU, you can do - * REGISTER_HIP_OPERATOR(MyMagic, - * GPUFallbackOp>); - */ -template > -class GPUFallbackOp final : public Operator { - public: - USE_OPERATOR_FUNCTIONS(HIPContext); - GPUFallbackOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) { - CAFFE_ENFORCE_EQ(def.device_option().device_type(), HIP); - OperatorDef base_def_(def); - // base_def_ runs on CPU, so we will set its device option to CPU. - base_def_.clear_device_option(); - base_def_.mutable_device_option()->set_device_type(CPU); - // Set up the symbols for the local workspace. - for (const string& name : def.input()) { - local_input_blobs_.push_back(local_ws_.CreateBlob(name)); - CHECK_NOTNULL(local_input_blobs_.back()); - } - base_op_.reset(new CPUOp(base_def_, &local_ws_)); - for (const string& name : def.output()) { - local_output_blobs_.push_back(local_ws_.GetBlob(name)); - CHECK_NOTNULL(local_output_blobs_.back()); - } - } - - bool RunOnDevice() override { - bool need_sync = false; - for (int i = 0; i < InputSize(); ++i) { - if (OperatorBase::InputIsType(i)) { - local_input_blobs_[i]->template GetMutable()->CopyFrom( - Input(i), &context_); - need_sync = true; - } else { - VLOG(1) << "Input " << i << " is not TensorHIP. Skipping copy."; - // Note(jiayq): This removes a const but conceptually - // local_input_blobs will only be used as const blob input for the - // base op so we are still fine. - local_input_blobs_[i]->ShareExternal( - const_cast(OperatorBase::Inputs()[i]->GetRaw()), - OperatorBase::Inputs()[i]->meta()); - } - } - - // Sync to make sure copies are done. - if (need_sync) { - context_.FinishDeviceComputation(); - } - - if (!base_op_->Run()) { - LOG(ERROR) << "Base op run failed in GPUFallbackOp. 
Def: " - << ProtoDebugString(this->debug_def()); - return false; - } - for (int i = 0; i < OutputSize(); ++i) { - if (SkipOutputCopy::Contains(i)) { - VLOG(1) << "Copy output: index " << i << " skipped."; - continue; - } - CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(), - "GPU fallback op currently does not support non-TensorCPU " - "output type who needs copying."); - Output(i)->CopyFrom( - local_output_blobs_[i]->template Get(), &context_); - } - return true; - } - - protected: - Workspace local_ws_; - vector local_input_blobs_; - vector local_output_blobs_; - std::unique_ptr base_op_; -}; - -} // namespace caffe2 - -#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_ diff --git a/caffe2/operators/hip/operator_fallback_hip_test.cc b/caffe2/operators/hip/operator_fallback_hip_test.cc deleted file mode 100644 index 4a074c35f8a186..00000000000000 --- a/caffe2/operators/hip/operator_fallback_hip_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include - -#include -#include "caffe2/core/operator.h" -#include "caffe2/operators/hip/operator_fallback_hip.h" - -namespace caffe2 { - -class IncrementByOneOp final : public Operator { - public: - IncrementByOneOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - bool RunOnDevice() { - const auto& in = Input(0); - auto* out = Output(0); - out->ResizeLike(in); - const float* in_data = in.template data(); - float* out_data = out->template mutable_data(); - for (int i = 0; i < in.size(); ++i) { - out_data[i] = in_data[i] + 1.f; - } - return true; - } -}; - -OPERATOR_SCHEMA(IncrementByOne) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}); - -REGISTER_CPU_OPERATOR(IncrementByOne, IncrementByOneOp); -REGISTER_HIP_OPERATOR(IncrementByOne, GPUFallbackOp); - -TEST(OperatorFallbackTest, IncrementByOneOp) { - OperatorDef op_def = CreateOperatorDef( - "IncrementByOne", "", vector{"X"}, vector{"X"}); - Workspace ws; - TensorCPU source_tensor(vector{2, 3}); - for (int i = 0; i < 6; ++i) { - source_tensor.mutable_data()[i] = i; - } - ws.CreateBlob("X")->GetMutable()->CopyFrom(source_tensor); - unique_ptr op(CreateOperator(op_def, &ws)); - EXPECT_TRUE(op.get() != nullptr); - EXPECT_TRUE(op->Run()); - const TensorCPU& output = ws.GetBlob("X")->Get(); - EXPECT_EQ(output.ndim(), 2); - EXPECT_EQ(output.dim(0), 2); - EXPECT_EQ(output.dim(1), 3); - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(output.data()[i], i + 1); - } -} - -TEST(OperatorFallbackTest, GPUIncrementByOneOp) { - if (!HasHipGPU()) - return; - OperatorDef op_def = CreateOperatorDef( - "IncrementByOne", "", vector{"X"}, vector{"X"}); - op_def.mutable_device_option()->set_device_type(HIP); - Workspace ws; - TensorCPU source_tensor(vector{2, 3}); - for (int i = 0; i < 6; ++i) { - source_tensor.mutable_data()[i] = i; - } - ws.CreateBlob("X")->GetMutable()->CopyFrom(source_tensor); - unique_ptr op(CreateOperator(op_def, &ws)); - EXPECT_TRUE(op.get() != nullptr); - EXPECT_TRUE(op->Run()); - const TensorHIP& output = ws.GetBlob("X")->Get(); - TensorCPU output_cpu(output); - EXPECT_EQ(output.ndim(), 2); - EXPECT_EQ(output.dim(0), 2); - EXPECT_EQ(output.dim(1), 3); - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(output_cpu.data()[i], i + 1); - } -} - -} // namespace caffe2 diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu index 5a5010dc608b36..87532066278b2e 100644 --- a/caffe2/operators/instance_norm_op.cu +++ b/caffe2/operators/instance_norm_op.cu @@ -51,7 +51,7 @@ __global__ void InstanceNormInvStdevKernel( } inv_stdev_data[i] /= dim; inv_stdev_data[i] += 
epsilon; - inv_stdev_data[i] = 1.0 / std::sqrt(inv_stdev_data[i]); + inv_stdev_data[i] = 1.0 / sqrtf(inv_stdev_data[i]); } } diff --git a/caffe2/operators/integral_image_op.cu b/caffe2/operators/integral_image_op.cu index d5c122001292fb..872d29bd0dddb4 100644 --- a/caffe2/operators/integral_image_op.cu +++ b/caffe2/operators/integral_image_op.cu @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "integral_image_op.h" +#include "caffe2/operators/integral_image_op.h" namespace caffe2 { diff --git a/caffe2/operators/layer_norm_op.cu b/caffe2/operators/layer_norm_op.cu index 0309b4e6d7fe10..bcec393b2ad95c 100644 --- a/caffe2/operators/layer_norm_op.cu +++ b/caffe2/operators/layer_norm_op.cu @@ -116,7 +116,7 @@ bool LayerNormOp::DoRunWithType() { mean->CopyFrom(input); mean->Resize(stats_dims); math::Set( - left, std::sqrt(epsilon_), stdev->mutable_data(), &context_); + left, sqrtf(epsilon_), stdev->mutable_data(), &context_); } else { // Calculate row-wise means // First stage: sum up feature vectors diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h index 5fdcb1d13058bf..7c42d522f2e71f 100644 --- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h @@ -68,6 +68,8 @@ class SparseLengthsFused8BitRowwiseOp : public Operator { return true; } + USE_VALUE_KEY_LENGTH_INPUT_FILLERS(Context, DATA, INDICES, LENGTHS) + private: enum { DATA = 0, diff --git a/caffe2/operators/lengths_reducer_ops.h b/caffe2/operators/lengths_reducer_ops.h index 461038ca3cb97f..505dad1b102de3 100644 --- a/caffe2/operators/lengths_reducer_ops.h +++ b/caffe2/operators/lengths_reducer_ops.h @@ -92,6 +92,8 @@ class CPUSparseLengthsReductionOp : public Operator { return true; } + USE_VALUE_KEY_LENGTH_INPUT_FILLERS(CPUContext, DATA, INDICES, LENGTHS) + private: enum { DATA = 0, // Data input. 
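The HeatmapMaxKeypointOp hunk earlier in this patch refines each integer argmax with a second-order Taylor fit over the surrounding 3x3 patch: b is the negated finite-difference gradient, A the finite-difference Hessian, the sub-pixel offset solves A·delta = b (via Eigen's LDLT in the operator), and the offset is rescaled so neither component exceeds 1.5 cells. The following is a minimal standalone NumPy sketch of that math for illustration only; it is not part of this patch, and it uses a general solver instead of LDLT.

```python
import numpy as np

MAX_DELTA = 1.5  # same clamp as the operator's MAX_DELTA

def refine_peak(f):
    """f: 3x3 patch of heatmap values around an integer argmax, f[1, 1] is the peak.
    Returns (delta, score): sub-pixel offset (dx, dy) and the refined score."""
    f = np.asarray(f, dtype=np.float64)
    # b = -f'(0), A = f''(0), estimated with central finite differences.
    b = np.array([-(f[1, 2] - f[1, 0]) / 2.0,
                  -(f[2, 1] - f[0, 1]) / 2.0])
    dxy = (f[2, 2] - f[2, 0] - f[0, 2] + f[0, 0]) / 4.0
    A = np.array([[f[1, 0] - 2.0 * f[1, 1] + f[1, 2], dxy],
                  [dxy, f[0, 1] - 2.0 * f[1, 1] + f[2, 1]]])
    # Near-singular Hessian: keep the grid maximum as-is.
    if abs(np.linalg.det(A)) < 1e-4:
        return np.zeros(2), f[1, 1]
    delta = np.linalg.solve(A, b)
    # Rescale (not clip per component) so the correction stays inside the
    # 3x3 window while preserving its direction.
    largest = np.max(np.abs(delta))
    if largest > MAX_DELTA:
        delta *= MAX_DELTA / largest
    # Refined score of the fitted quadratic at the offset.
    score = f[1, 1] - b @ delta + 0.5 * delta @ A @ delta
    return delta, score

# Example: the peak's right neighbor is larger than its left neighbor,
# so the refined offset moves toward +x and the score exceeds the grid max.
patch = np.array([[0.2, 0.5, 0.3],
                  [0.3, 1.0, 0.8],
                  [0.2, 0.6, 0.4]])
print(refine_peak(patch))
```

The refined score f(1,1) - bᵀδ + ½ δᵀAδ is what the operator stores in the score row of its output; the offset itself is added to the argmax position before mapping back into box coordinates.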
diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h index 58ebe6cb58e846..8af4a413239997 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h @@ -87,6 +87,8 @@ class SparseLengths8BitsRowwiseOp : public Operator { return true; } + USE_VALUE_LENGTH_INPUT_FILLERS(Context, DATA, LENGTHS) + enum { DATA = 0, WEIGHTS = 1, diff --git a/caffe2/operators/lstm_unit_op_gpu.cu b/caffe2/operators/lstm_unit_op_gpu.cu index bdd62e5c3c4228..e15d706635b447 100644 --- a/caffe2/operators/lstm_unit_op_gpu.cu +++ b/caffe2/operators/lstm_unit_op_gpu.cu @@ -2,7 +2,7 @@ #include #include #include "caffe2/core/context_gpu.h" -#include "lstm_unit_op.h" +#include "caffe2/operators/lstm_unit_op.h" namespace caffe2 { diff --git a/caffe2/operators/max_pool_with_index.cu b/caffe2/operators/max_pool_with_index.cu index b8e6d2b469e5dc..5ac3c58bb5f89b 100644 --- a/caffe2/operators/max_pool_with_index.cu +++ b/caffe2/operators/max_pool_with_index.cu @@ -1,4 +1,4 @@ -#include "caffe2/operators/max_pool_with_index.h" +#include "caffe2/operators/max_pool_with_index_gpu.h" #include "caffe2/utils/conversions.h" namespace caffe2 { diff --git a/caffe2/operators/max_pool_with_index.h b/caffe2/operators/max_pool_with_index_gpu.h similarity index 88% rename from caffe2/operators/max_pool_with_index.h rename to caffe2/operators/max_pool_with_index_gpu.h index 64337dc56088f7..abc26233a7dce6 100644 --- a/caffe2/operators/max_pool_with_index.h +++ b/caffe2/operators/max_pool_with_index_gpu.h @@ -1,5 +1,4 @@ -#ifndef CAFFE2_OPERATORS_MAX_POOL_WITH_INDEX_H_ -#define CAFFE2_OPERATORS_MAX_POOL_WITH_INDEX_H_ +#pragma once #include #include "caffe2/core/context.h" @@ -45,5 +44,3 @@ class MaxPoolWithIndexGradientOp final : public ConvPoolOpBase { }; }; // namespace caffe2 - -#endif // CAFFE2_OPERATORS_MAX_POOL_WITH_INDEX_H_ diff --git a/caffe2/operators/normalize_op.cc b/caffe2/operators/normalize_op.cc index 1a7d720deb6c3c..73a88201b3278e 100644 --- a/caffe2/operators/normalize_op.cc +++ b/caffe2/operators/normalize_op.cc @@ -12,6 +12,7 @@ void NormalizeOp::DoNormalize( const int m, const int n, const int sf) { + const T kEps = 1e-12f; using InnerStride = Eigen::InnerStride; using StridedVec = Eigen::Map, 0, InnerStride>; @@ -22,10 +23,9 @@ void NormalizeOp::DoNormalize( auto base = (i / sf) * sf * m + (i % sf); ConstStridedVec xVec(xData + base, 1, m, InnerStride(sf)); auto norm = xVec.template lpNorm<2>(); - if (norm != 0) { - StridedVec yVec(yData + base, 1, m, InnerStride(sf)); - yVec = xVec / norm; - } + norm = std::max(norm, kEps); + StridedVec yVec(yData + base, 1, m, InnerStride(sf)); + yVec = xVec / norm; } }; @@ -37,6 +37,7 @@ void NormalizeGradientOp::DoNormalize( const int m, const int n, const int sf) { + const T kEps = 1e-12f; using InnerStride = Eigen::InnerStride; using StridedVec = Eigen::Map, 0, InnerStride>; @@ -50,11 +51,10 @@ void NormalizeGradientOp::DoNormalize( auto row_sum = xVec.dot(gOutVec); auto row_norm = xVec.template lpNorm<2>(); + row_norm = std::max(row_norm, kEps); auto row_norm_3 = pow(row_norm, 3); - if (row_norm != 0) { - StridedVec gInVec(gInData + base, 1, m, InnerStride(sf)); - gInVec = (gOutVec / row_norm) - ((xVec / row_norm_3) * row_sum); - } + StridedVec gInVec(gInData + base, 1, m, InnerStride(sf)); + gInVec = (gOutVec / row_norm) - ((xVec / row_norm_3) * row_sum); } }; diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu index 
343a8cee24ec8d..dcffe02f650abf 100644 --- a/caffe2/operators/normalize_ops.cu +++ b/caffe2/operators/normalize_ops.cu @@ -12,6 +12,7 @@ __global__ void NormalizeKernel( const int sf, const float* xData, float* yData) { + const float kEps = 1e-12f; typedef cub::BlockReduce BlockReduce; __shared__ BlockReduce::TempStorage temp_storage; @@ -27,14 +28,13 @@ __global__ void NormalizeKernel( float reduce_result = BlockReduce(temp_storage).Sum(sum); if (threadIdx.x == 0) { - norm = sqrt(reduce_result); + norm = sqrtf(reduce_result); + norm = fmaxf(norm, kEps); } __syncthreads(); - if (norm != 0) { - for (int j = threadIdx.x; j < m; j += blockDim.x) { - const auto index = base + j * sf; - yData[index] = xData[index] / norm; - } + for (int j = threadIdx.x; j < m; j += blockDim.x) { + const auto index = base + j * sf; + yData[index] = xData[index] / norm; } } } @@ -46,6 +46,7 @@ __global__ void NormalizeGradientKernel( const float* in_mat, const float* grad_out_mat, float* grad_mat) { + const float kEps = 1e-12f; typedef cub::BlockReduce BlockReduce; __shared__ BlockReduce::TempStorage temp_storage_sum; __shared__ BlockReduce::TempStorage temp_storage_norm; @@ -66,8 +67,9 @@ __global__ void NormalizeGradientKernel( if (threadIdx.x == 0) { row_sum = reduce_result; - row_norm = sqrt(reduce_norm); - row_norm_3 = pow(row_norm, 3); + row_norm = sqrtf(reduce_norm); + row_norm = fmaxf(row_norm, kEps); + row_norm_3 = powf(row_norm, 3); } __syncthreads(); for (int j = threadIdx.x; j < N; j += blockDim.x) { @@ -131,7 +133,7 @@ __global__ void NormalizeL1Kernel( __shared__ float norm; for (int j = threadIdx.x; j < m; j += blockDim.x) { const auto x_ij = xData[base + j * sf]; - sum += abs(x_ij); + sum += fabsf(x_ij); } float reduce_result = BlockReduce(temp_storage).Sum(sum); diff --git a/caffe2/operators/piecewise_linear_transform_op.cu b/caffe2/operators/piecewise_linear_transform_op.cu index 877b795c19076b..ecc9f0f2493972 100644 --- a/caffe2/operators/piecewise_linear_transform_op.cu +++ b/caffe2/operators/piecewise_linear_transform_op.cu @@ -256,8 +256,8 @@ bool PiecewiseLinearTransformOp::TransformBinary() { X.data(), Y->mutable_data()); } else { + // don't want N*M threads, only N*M/2 PieceWiseLinearTransformBinaryKernel2<<< - // don't want N*M threads, only N*M/2 CAFFE_GET_BLOCKS(X.size() / 2), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/relu_op.cu b/caffe2/operators/relu_op.cu index d392e4994bc14e..7309270aa28cc8 100644 --- a/caffe2/operators/relu_op.cu +++ b/caffe2/operators/relu_op.cu @@ -9,6 +9,10 @@ namespace caffe2 { namespace { +#ifdef __HIPCC__ +typedef __half2 half2; +#endif + template __global__ void ReluCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { diff --git a/caffe2/operators/resize_op.cc b/caffe2/operators/resize_op.cc index 508ab3390e6072..8a272a3d40f96a 100644 --- a/caffe2/operators/resize_op.cc +++ b/caffe2/operators/resize_op.cc @@ -153,7 +153,8 @@ output_width = floor(input_width * width_scale) output_height = floor(output_height * height_scale) )DOC") .Input(0, "X", "Input tensor") - .Output(0, "Y", "Output tensor"); + .Output(0, "Y", "Output tensor") + .InheritOnnxSchema("Upsample"); // Input: dY, output: dX OPERATOR_SCHEMA(ResizeNearestGradient) diff --git a/caffe2/operators/resize_op.cu b/caffe2/operators/resize_op.cu index 5098d1d590ccbe..0e1d55e5a4f309 100644 --- a/caffe2/operators/resize_op.cu +++ b/caffe2/operators/resize_op.cu @@ -1,6 +1,6 @@ #include "caffe2/core/context_gpu.h" #include "caffe2/utils/math.h" -#include "resize_op.h" 
+#include "caffe2/operators/resize_op.h" namespace caffe2 { diff --git a/caffe2/operators/reverse_packed_segs_op.cu b/caffe2/operators/reverse_packed_segs_op.cu index fdcffc66c240e4..aab600e27c7496 100644 --- a/caffe2/operators/reverse_packed_segs_op.cu +++ b/caffe2/operators/reverse_packed_segs_op.cu @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "reverse_packed_segs_op.h" +#include "caffe2/operators/reverse_packed_segs_op.h" namespace caffe2 { diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu index 26fb555d184cde..534d55ddd9a469 100644 --- a/caffe2/operators/roi_align_gradient_op.cu +++ b/caffe2/operators/roi_align_gradient_op.cu @@ -1,4 +1,4 @@ -#include "roi_align_gradient_op.h" +#include "caffe2/operators/roi_align_gradient_op.h" #include #include diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu index 4a448ae8939e3f..e512f3d9741393 100644 --- a/caffe2/operators/roi_align_op.cu +++ b/caffe2/operators/roi_align_op.cu @@ -1,4 +1,4 @@ -#include "roi_align_op.h" +#include "caffe2/operators/roi_align_op.h" #include #include @@ -76,6 +76,7 @@ __global__ void RoIAlignForward( const int pooled_width, const int sampling_ratio, const T* bottom_rois, + int roi_cols, T* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output @@ -84,18 +85,23 @@ __global__ void RoIAlignForward( int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; + // RoI could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } // Do not using rounding; this implementation detail is critical - T roi_start_w = offset_bottom_rois[1] * spatial_scale; - T roi_start_h = offset_bottom_rois[2] * spatial_scale; - T roi_end_w = offset_bottom_rois[3] * spatial_scale; - T roi_end_h = offset_bottom_rois[4] * spatial_scale; - // T roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale); - // T roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale); - // T roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale); - // T roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale); + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = roundf(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = roundf(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = roundf(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = roundf(offset_bottom_rois[3] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, (T)1.); @@ -173,6 +179,7 @@ bool RoIAlignOp::RunOnDevice() { pooled_width_, sampling_ratio_, R.data(), + R.dim32(1), Y->mutable_data()); return true; } diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index 199500f93df3a8..ed4ef33a1d6880 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -1,5 +1,5 @@ #include "caffe2/utils/eigen_utils.h" -#include "roi_align_op.h" +#include "caffe2/operators/roi_align_op.h" #include "caffe2/core/context_gpu.h" #include 
"caffe2/core/flags.h" diff --git a/caffe2/operators/roi_align_rotated_gradient_op.cu b/caffe2/operators/roi_align_rotated_gradient_op.cu index 1606209944d0eb..1941029fc9d3fb 100644 --- a/caffe2/operators/roi_align_rotated_gradient_op.cu +++ b/caffe2/operators/roi_align_rotated_gradient_op.cu @@ -3,7 +3,7 @@ #endif // _MSC_VER #include -#include "roi_align_rotated_gradient_op.h" +#include "caffe2/operators/roi_align_rotated_gradient_op.h" #include #include diff --git a/caffe2/operators/roi_align_rotated_op.cu b/caffe2/operators/roi_align_rotated_op.cu index 3f8a609451fbd4..0fad3d74b397eb 100644 --- a/caffe2/operators/roi_align_rotated_op.cu +++ b/caffe2/operators/roi_align_rotated_op.cu @@ -3,7 +3,7 @@ #endif // _MSC_VER #include -#include "roi_align_rotated_op.h" +#include "caffe2/operators/roi_align_rotated_op.h" #include #include diff --git a/caffe2/operators/roi_pool_op.cu b/caffe2/operators/roi_pool_op.cu index 34dcebb72b3ca4..45839117b2eda3 100644 --- a/caffe2/operators/roi_pool_op.cu +++ b/caffe2/operators/roi_pool_op.cu @@ -1,7 +1,7 @@ #include #include "caffe2/core/context_gpu.h" -#include "roi_pool_op.h" +#include "caffe2/operators/roi_pool_op.h" namespace caffe2 { diff --git a/caffe2/operators/selu_op.cu b/caffe2/operators/selu_op.cu index 314f2b48370e53..95eb2c54ee96a1 100644 --- a/caffe2/operators/selu_op.cu +++ b/caffe2/operators/selu_op.cu @@ -33,7 +33,7 @@ bool SeluOp::RunOnDevice() { auto* Y = Output(0); CAFFE_ENFORCE_GT(X.size(), 0); Y->ResizeLike(X); - SeluKernel<<< + SeluKernel<<< CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS, 0, @@ -50,7 +50,7 @@ bool SeluGradientOp::RunOnDevice() { CAFFE_ENFORCE_GT(Y.size(), 0); CAFFE_ENFORCE_EQ(dY.size(), Y.size()); dX->ResizeLike(Y); - SeluGradientKernel<<< + SeluGradientKernel<<< CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu index 8795b6c9f7a8a0..08dbf6e7d07a48 100644 --- a/caffe2/operators/softmax_ops.cu +++ b/caffe2/operators/softmax_ops.cu @@ -2,9 +2,9 @@ #include #include "caffe2/core/context_gpu.h" -#include "softmax_op.h" -#include "softmax_with_loss_op.h" -#include "spatial_softmax_with_loss_op.h" +#include "caffe2/operators/softmax_op.h" +#include "caffe2/operators/softmax_with_loss_op.h" +#include "caffe2/operators/spatial_softmax_with_loss_op.h" namespace caffe2 { @@ -70,7 +70,7 @@ __global__ void ProbCrossEntropyKernel( int idx = i * D + j; CUDA_KERNEL_ASSERT(labeldata[idx] >= 0); total_prob += labeldata[idx]; - sum += -logf(max(Pdata[idx], FLT_MIN)) * labeldata[idx] * weight; + sum += -logf(fmaxf(Pdata[idx], FLT_MIN)) * labeldata[idx] * weight; } float tot = BlockReduce(temp_storage).Sum(sum); __syncthreads(); @@ -78,7 +78,7 @@ __global__ void ProbCrossEntropyKernel( if (threadIdx.x == 0) { Ydata[i] = tot; // Sanity check - CUDA_KERNEL_ASSERT(abs(1.0 - total_prob_sum) < 1e-5f); + CUDA_KERNEL_ASSERT(fabsf(1.0 - total_prob_sum) < 1e-5f); } __syncthreads(); } @@ -118,14 +118,14 @@ __global__ void SpatialSoftmaxKernel( float max_val = -FLT_MAX; for(int c = 0; c < D; ++c) { int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); + max_val = fmaxf(max_val, Xdata[idx]); } // Exponentiate float expsum = 0.0f; for(int c = 0; c < D; ++c) { int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); + float expx = expf(Xdata[idx] - max_val); Pdata[idx] = expx; expsum += expx; } @@ -160,7 +160,7 @@ __global__ void SpatialCrossEntropyLossKernel( if (label != DONTCARE) { 
CUDA_KERNEL_ASSERT(label >= 0 && label < D); float weight = (weights == NULL ? 1.0 : weights[index]); - loss_data[index] = -log(max( + loss_data[index] = -logf(fmaxf( Pdata[i * W * H * D + label * W * H + y * W + x], 1e-20f)) * weight; weight_data[index] = weight; } else { @@ -213,7 +213,7 @@ __global__ void SoftmaxNormalizeLogsKernel( float* out_log) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int n = index / D; - out_log[index] = logits[index] - rowmax[n] - logf(max(scales[n], FLT_MIN)); + out_log[index] = logits[index] - rowmax[n] - logf(fmaxf(scales[n], FLT_MIN)); } } diff --git a/caffe2/operators/softplus_op.cu b/caffe2/operators/softplus_op.cu index e733c47a6be386..7e542f5a9b7c84 100644 --- a/caffe2/operators/softplus_op.cu +++ b/caffe2/operators/softplus_op.cu @@ -26,7 +26,7 @@ bool SoftplusOp::RunOnDevice() { auto* Y = Output(0); DCHECK_GT(X.size(), 0); Y->ResizeLike(X); - SoftplusKernel<<< + SoftplusKernel<<< CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS, 0, @@ -43,7 +43,7 @@ bool SoftplusGradientOp::RunOnDevice() { DCHECK_GT(Y.size(), 0); DCHECK_EQ(dY.size(), Y.size()); dX->ResizeLike(Y); - SoftplusGradientKernel<<< + SoftplusGradientKernel<<< CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/softsign_op.cu b/caffe2/operators/softsign_op.cu index 9eeaad33c4251e..e3a32507adad66 100644 --- a/caffe2/operators/softsign_op.cu +++ b/caffe2/operators/softsign_op.cu @@ -14,13 +14,26 @@ inline __host__ __device__ T SquareCUDA(const T x) { return x * x; } +template +inline __device__ T typed_abs(T x); + +template <> +inline __device__ float typed_abs(float x) { + return fabsf(x); +} + +template <> +inline __device__ double typed_abs(double x) { + return fabs(x); +} + template __global__ void SoftsignCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { #if __CUDA_ARCH__ >= 350 - Y[i] = __ldg(X + i) / (T(1) + abs(__ldg(X + i))); + Y[i] = __ldg(X + i) / (T(1) + typed_abs(__ldg(X + i))); #else - Y[i] = X[i] / (T(1) + abs(X[i])); + Y[i] = X[i] / (T(1) + typed_abs(X[i])); #endif } } @@ -30,9 +43,9 @@ __global__ void SoftsignGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { #if __CUDA_ARCH__ >= 350 - dX[i] = __ldg(dY + i) / SquareCUDA(T(1) + abs(__ldg(X + i))); + dX[i] = __ldg(dY + i) / SquareCUDA(T(1) + typed_abs(__ldg(X + i))); #else - dX[i] = dY[i] / SquareCUDA(T(1) + abs(X[i])); + dX[i] = dY[i] / SquareCUDA(T(1) + typed_abs(X[i])); #endif } } diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 517c9ac1639617..5ea10d17c3f29f 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -37,6 +37,9 @@ class SparseToDenseMaskBase : public Operator { } } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + protected: const int64_t kMaxDenseSize = 1024 * 128; diff --git a/caffe2/operators/sparse_to_dense_op.cu b/caffe2/operators/sparse_to_dense_op.cu index 1086c0a6c521d5..c62718a8ece1b7 100644 --- a/caffe2/operators/sparse_to_dense_op.cu +++ b/caffe2/operators/sparse_to_dense_op.cu @@ -1,4 +1,4 @@ -#include "sparse_to_dense_op.h" +#include "caffe2/operators/sparse_to_dense_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" diff --git a/caffe2/operators/top_k_radix_selection.cuh b/caffe2/operators/top_k_radix_selection.cuh index c9913df9f41053..69a9710ec78f04 100644 --- a/caffe2/operators/top_k_radix_selection.cuh +++ b/caffe2/operators/top_k_radix_selection.cuh 
@@ -360,7 +360,7 @@ __global__ void gatherTopK(const T* inputPtr, // Find the start offset for our slice const T* inputSliceStart = &inputPtr[slice * inputSliceSize]; T* topKSliceStart = &topKPtr[slice * outputSliceSize]; - caffe2::TIndex* indicesSliceStart = &indicesPtr[slice * outputSliceSize]; + IndicesType* indicesSliceStart = &indicesPtr[slice * outputSliceSize]; // Find the k-th highest element in our input T topKValue = (T)0; diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index 9e68790f0a262c..12ded223f6925d 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -1,9 +1,3 @@ -#include -#include -// TODO(jamesreed): I would use here but std::isnan -// and std::isinf are declared constexpr there and the nvidia -// compiler throws an error because of it - #include "caffe2/core/context_gpu.h" #include "caffe2/operators/flatten_op.h" #include "caffe2/operators/minmax_ops.h" @@ -169,7 +163,7 @@ bool NanCheckOp::RunOnDevice() { std::cerr << "NaN idxs:" << std::endl; auto* cpu_X_data = cpu_X.data(); for (size_t i = 0; i < cpu_X.size(); ++i) { - if (isnan(cpu_X_data[i]) || isinf(cpu_X_data[i])) { + if (std::isnan(cpu_X_data[i]) || std::isinf(cpu_X_data[i])) { std::cerr << i << " "; } } @@ -404,7 +398,7 @@ bool ScatterWeightedSumOp::DoRunWithType() { TIndex K = indices.size(); TIndex block_size = M / N; - T* data = output->template mutable_data(); + float* data = output->template mutable_data(); // In order to have all device pointers of x_i (and weight_i similarly) // consecutively in device memory, copy pointers to a host vector and then diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index d79296c1ebfa40..a0eb0f3c531f03 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -703,6 +703,9 @@ class LengthsToSegmentIdsOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp); + // TODO: enable the InputFillers + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); @@ -758,6 +761,9 @@ class SegmentIdsToLengthsOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp); + // TODO: enable the InputFillers + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } @@ -815,6 +821,9 @@ class SegmentIdsToRangesOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SegmentIdsToRangesOp); + // TODO: enable the InputFillers + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index 5405329e40a58b..ba2cbe1677c8b1 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -9,33 +9,31 @@ # attempt to load the cpu version. The cpu backend is the minimum required, so # if that still fails, we will exit loud. 
with extension_loader.DlopenGuard(): + has_hip_support = False + has_gpu_support = False + try: from caffe2.python.caffe2_pybind11_state_gpu import * # noqa if num_cuda_devices(): # noqa has_gpu_support = True - else: - has_gpu_support = False - except ImportError as e: - has_gpu_support = False + except ImportError as gpu_e: + logging.info('Failed to import cuda module: {}'.format(gpu_e)) try: from caffe2.python.caffe2_pybind11_state_hip import * # noqa if num_hip_devices(): has_hip_support = True logging.info('This caffe2 python run has AMD GPU support!') - else: - has_hip_support = False - except ImportError as e: - logging.info('Failed to import AMD hip module: {}'.format(e)) + except ImportError as hip_e: + logging.info('Failed to import AMD hip module: {}'.format(hip_e)) logging.warning( 'This caffe2 python run does not have GPU support. ' 'Will run in CPU only mode.') - logging.warning('Debug message: {0}'.format(str(e))) try: from caffe2.python.caffe2_pybind11_state import * # noqa - except ImportError as e: + except ImportError as cpu_e: logging.critical( - 'Cannot load caffe2.python. Error: {0}'.format(str(e))) + 'Cannot load caffe2.python. Error: {0}'.format(str(cpu_e))) sys.exit(1) # libcaffe2_python contains a global Workspace that we need to properly delete diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 3caa3ee715d5d2..9fef5724ad2f62 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -2732,6 +2732,8 @@ def create_from_proto(cls, plan_proto): assert isinstance(plan_proto, caffe2_pb2.PlanDef) plan = Plan(plan_proto.name) plan._plan.CopyFrom(plan_proto) + del plan._plan.network[:] + del plan._plan.execution_step[:] net_obj_dict = {} net_proto_dict = {} diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index eafdc5385836ee..2a4afc82c8067f 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -481,8 +481,11 @@ def test_create_plan_from_proto_correctly(self): self.assertEqual(len(plan.Steps()), 1) self.assertEqual(len(test_plan.Steps()), 1) + self.assertEqual(len(plan.Proto().network), 9) + self.assertEqual(len(test_plan.Proto().network), 9) + self.assertEqual(len(plan.Proto().execution_step), 1) + self.assertEqual(len(test_plan.Proto().execution_step), 1) self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) - self.assertEqual(len(plan.Nets()), len(test_plan.Nets())) for idx in range(0, len(plan.Nets())): # When we create Net for test_plan, we will end up with new Net diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 9deb588ea866d5..e501a7d41d3ecc 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -251,7 +251,8 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): cpu_do = caffe2_pb2.DeviceOption() gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) -device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) +hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP) +device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) + ([hip_do] if workspace.has_hip_support else []) # Include device option for each GPU expanded_device_options = [cpu_do] + ( [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 3d7e76a7176d35..fa62fbe6588d1e 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -888,13 +888,17 @@ def 
kmap(k): return cls._global_renamed_attrs[k] return k c2_op.arg.extend(onnx_node.attrs.caffe2(kmap=kmap)) - if c2_op.type in cls._broadcast_operators: - already_broadcast = False - for arg in c2_op.arg: - if arg.name == 'broadcast': - already_broadcast = True - if not already_broadcast: - c2_op.arg.extend([caffe2.python.utils.MakeArgument('broadcast', 1)]) + + if opset_version < 7: + # onnx opset 7 and newest caffe2 have adopted full onnx broadcast semantics + # so we don't need this hack anymore + if c2_op.type in cls._broadcast_operators: + already_broadcast = False + for arg in c2_op.arg: + if arg.name == 'broadcast': + already_broadcast = True + if not already_broadcast: + c2_op.arg.extend([caffe2.python.utils.MakeArgument('broadcast', 1)]) return c2_op diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index a531ee37d33a9c..97d824e05897a5 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -37,20 +37,16 @@ def test_dummy_name(self): assert n1 != n2, "Got same names in different calls: {}".format(n1) def test_check_arguments(self): - X = np.random.randn(3, 2).astype(np.float32) - Y = np.random.randn(3, 2).astype(np.float32) - Z = np.zeros((3, 2)).astype(np.float32) - b2 = C.Caffe2Backend() node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"]) - output = b2.convert_node(node_def.SerializeToString(), 6) + b2.convert_node(node_def.SerializeToString(), 6) bad_node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"], foo = 42, bar = 56) with self.assertRaisesRegexp( RuntimeError, ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"): - output = b2.convert_node(bad_node_def.SerializeToString(), 6) + b2.convert_node(bad_node_def.SerializeToString(), 6) def test_relu_graph(self): X = np.random.randn(3, 2).astype(np.float32) @@ -105,6 +101,37 @@ def sigmoid(x): output = c2_rep.run({"X": X, "Y": Y}) np.testing.assert_almost_equal(output["W3"], W_ref) + def test_upsample(self): + X = np.random.randn(1, 1, 2, 2).astype(np.float32) + width_scale = 2.0 + height_scale = 2.0 + + predict_net = caffe2_pb2.NetDef() + predict_net.name = 'test-upsample-net' + predict_net.external_input[:] = ['X'] + predict_net.external_output[:] = ['Y'] + predict_net.op.extend([ + core.CreateOperator( + 'ResizeNearest', + inputs=['X'], + outputs=['Y'], + width_scale=width_scale, + height_scale=height_scale, + ), + ]) + ws, c2_outputs = c2_native_run_net( + init_net=None, + predict_net=predict_net, + inputs=[X]) + + onnx_model = c2_onnx.caffe2_net_to_onnx_model( + predict_net=predict_net, + value_info={ + 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape) + }) + onnx_outputs = c2.run_model(onnx_model, inputs=[X]) + self.assertSameOutputs(c2_outputs, onnx_outputs) + def test_gemm(self): # simple A = np.random.randn(3, 2).astype(np.float32) @@ -379,9 +406,9 @@ def test_vgg16(self): @unittest.skipIf( os.environ.get('JENKINS_URL'), - 'Running vgg19 on Travis with Python 2 keeps getting OOM!') - def test_vgg19(self): - self._test_net('vgg19') + 'Taking too long to download!') + def test_zfnet(self): + self._test_net('zfnet') def test_inception_v1(self): self._test_net('inception_v1', decimal=2) diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index e1604cc8a36d02..24d6bc83878def 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -35,7 +35,6 @@ 
'|test_operator_repeat.*' # Tile is not compliant with ONNX yet '|test_.*pool_.*same.*' # Does not support pool same. '|test_convtranspose.*' # ConvTranspose needs some more complicated translation - '|test_averagepool.*count_include_pad.*' # Waiting for the support in Caffe2 onnx backend. ')') # Quick patch to unbreak master CI, is working on the debugging. diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 70aa45d6d50bdd..6e56da29b7f6a9 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -32,17 +32,15 @@ def affine_channel_nhwc_ref(self, X, scale, bias): @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - is_learnable=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - in_place=st.booleans(), **hu.gcs) + is_learnable=st.booleans(), in_place=st.booleans(), **hu.gcs) def test_affine_channel_2d( - self, N, C, H, W, order, is_learnable, engine, in_place, gc, dc): + self, N, C, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( "AffineChannel", ["X", "scale", "bias"], ["X"] if in_place and not is_learnable else ["Y"], order=order, is_learnable=is_learnable, - engine=engine, ) if order == "NCHW": @@ -73,17 +71,15 @@ def ref_op(X, scale, bias): @given(N=st.integers(1, 5), C=st.integers(1, 5), T=st.integers(1, 3), H=st.integers(1, 3), W=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), in_place=st.booleans(), - **hu.gcs) + in_place=st.booleans(), **hu.gcs) def test_affine_channel_3d( - self, N, C, T, H, W, order, is_learnable, engine, in_place, gc, dc): + self, N, C, T, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( "AffineChannel", ["X", "scale", "bias"], ["X"] if in_place and not is_learnable else ["Y"], order=order, is_learnable=is_learnable, - engine=engine, ) if order == "NCHW": diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py new file mode 100644 index 00000000000000..2db25e73892563 --- /dev/null +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -0,0 +1,92 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from caffe2.python import core +from hypothesis import given + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st + + +class TestBatchMomentsOp(hu.HypothesisTestCase): + def batch_moments_nchw_ref(self, X): + dims = X.shape + N = dims[0] + C = dims[1] + X = X.reshape(N, C, -1) + mu = np.mean(X, axis=(0, 2)) + var = np.mean(np.square(X), axis=(0, 2)) + return [mu, var] + + def batch_moments_nhwc_ref(self, X): + dims = X.shape + C = dims[-1] + X = X.reshape(-1, C) + mu = np.mean(X, axis=0) + var = np.mean(np.square(X), axis=0) + return [mu, var] + + @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), + W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), + **hu.gcs) + def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): + op = core.CreateOperator( + "BatchMoments", + ["X"], + ["mu", "var"], + order=order, + ) + + if order == "NCHW": + X = np.random.randn(N, C, H, W).astype(np.float32) + else: + X = np.random.randn(N, H, W, C).astype(np.float32) + + def ref(X): + if order == "NCHW": + return 
self.batch_moments_nchw_ref(X) + else: + return self.batch_moments_nhwc_ref(X) + + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[X], + reference=ref, + ) + self.assertDeviceChecks(dc, op, [X], [0, 1]) + self.assertGradientChecks(gc, op, [X], 0, [0, 1]) + + @given(N=st.integers(1, 5), C=st.integers(1, 5), T=st.integers(1, 3), + H=st.integers(1, 3), W=st.integers(1, 3), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + def test_batch_moments_3d(self, N, C, T, H, W, order, gc, dc): + op = core.CreateOperator( + "BatchMoments", + ["X"], + ["mu", "var"], + order=order, + ) + + if order == "NCHW": + X = np.random.randn(N, C, T, H, W).astype(np.float32) + else: + X = np.random.randn(N, T, H, W, C).astype(np.float32) + + def ref(X): + if order == "NCHW": + return self.batch_moments_nchw_ref(X) + else: + return self.batch_moments_nhwc_ref(X) + + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[X], + reference=ref, + ) + self.assertDeviceChecks(dc, op, [X], [0, 1]) + self.assertGradientChecks(gc, op, [X], 0, [0, 1]) diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py index 965bbe73fec8c8..933d78f4e4f07f 100644 --- a/caffe2/python/operator_test/normalize_op_test.py +++ b/caffe2/python/operator_test/normalize_op_test.py @@ -9,45 +9,45 @@ import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import copy class TestNormalizeOp(hu.HypothesisTestCase): - - @given(X=hu.tensor(min_dim=1, - max_dim=5, - elements=st.floats(min_value=0.5, max_value=1.0)), - **hu.gcs) + @given( + X=hu.tensor( + min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0) + ), + **hu.gcs + ) def test_normalize(self, X, gc, dc): def ref_normalize(X, axis): - x_normed = X / ( - np.sqrt((X**2).sum(axis=axis, keepdims=True)) + np.finfo(X.dtype).tiny) + x_normed = X / np.maximum( + np.sqrt((X ** 2).sum(axis=axis, keepdims=True)), 1e-12 + ) return (x_normed,) for axis in range(-X.ndim, X.ndim): + x = copy.copy(X) op = core.CreateOperator("Normalize", "X", "Y", axis=axis) self.assertReferenceChecks( - gc, - op, - [X], - functools.partial(ref_normalize, axis=axis)) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(min_dim=1, - max_dim=5, - elements=st.floats(min_value=0.5, max_value=1.0)), - **hu.gcs) + gc, op, [x], functools.partial(ref_normalize, axis=axis) + ) + self.assertDeviceChecks(dc, op, [x], [0]) + self.assertGradientChecks(gc, op, [x], 0, [0]) + + @given( + X=hu.tensor( + min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0) + ), + **hu.gcs + ) def test_normalize_L1(self, X, gc, dc): def ref(X, axis): norm = abs(X).sum(axis=axis, keepdims=True) return (X / norm,) for axis in range(-X.ndim, X.ndim): - print('axis: ', axis) + print("axis: ", axis) op = core.CreateOperator("NormalizeL1", "X", "Y", axis=axis) - self.assertReferenceChecks( - gc, - op, - [X], - functools.partial(ref, axis=axis)) + self.assertReferenceChecks(gc, op, [X], functools.partial(ref, axis=axis)) self.assertDeviceChecks(dc, op, [X], [0]) diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index e58f5ba6be1ebe..7c421ff2a870e1 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -5,6 +5,7 @@ #include "caffe2/contrib/script/compiler.h" #include "caffe2/core/asan.h" +#include "caffe2/core/blob_stats.h" #include "caffe2/core/db.h" #include 
"caffe2/core/numa.h" #include "caffe2/core/operator.h" @@ -1433,6 +1434,12 @@ void addGlobalMethods(py::module& m) { CAFFE_ENFORCE(raw_data); return GetNUMANode(raw_data); }); + m.def("get_blob_size_bytes", [](const std::string& blob_name) { + CAFFE_ENFORCE(gWorkspace); + auto* blob = gWorkspace->GetBlob(blob_name); + CAFFE_ENFORCE(blob); + return BlobStat::sizeBytes(*blob); + }); m.def("support_onnx_export", [](const std::string& op) -> bool { const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(op); if (!schema) { diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index cb73dda38e8120..1c618ac9efabd7 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -42,6 +42,7 @@ is_asan = C.is_asan has_gpu_support = C.has_gpu_support +has_hip_support = C.has_hip_support if has_gpu_support: NumCudaDevices = C.num_cuda_devices GetCUDAVersion = C.get_cuda_version @@ -61,6 +62,7 @@ def GetCudaPeerAccessPattern(): IsNUMAEnabled = C.is_numa_enabled GetNumNUMANodes = C.get_num_numa_nodes GetBlobNUMANode = C.get_blob_numa_node +GetBlobSizeBytes = C.get_blob_size_bytes def _GetFreeFlaskPort(): """Get a free flask port.""" diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 7e5047c74a5181..78468ec8548af9 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -189,6 +189,30 @@ def testFetchFeedBlobBool(self): self.assertEqual(fetched_back.dtype, np.bool) np.testing.assert_array_equal(fetched_back, data) + def testGetBlobSizeBytes(self): + for dtype in [np.float16, np.float32, np.float64, np.bool, + np.int8, np.int16, np.int32, np.int64, + np.uint8, np.uint16]: + data = np.random.randn(2, 3).astype(dtype) + self.assertTrue(workspace.FeedBlob("testblob_sizeBytes", data), True) + self.assertEqual( + workspace.GetBlobSizeBytes("testblob_sizeBytes"), + 6 * np.dtype(dtype).itemsize) + strs1 = np.array([b'Hello World!', b'abcd']) + strs2 = np.array([b'element1', b'element2']) + strs1_len, strs2_len = 0, 0 + for str in strs1: + strs1_len += len(str) + for str in strs2: + strs2_len += len(str) + self.assertTrue(workspace.FeedBlob("testblob_str1", strs1), True) + self.assertTrue(workspace.FeedBlob("testblob_str2", strs2), True) + # size of blob "testblob_str1" = size_str1 * meta_.itemsize() + strs1_len + # size of blob "testblob_str2" = size_str2 * meta_.itemsize() + strs2_len + self.assertEqual( + workspace.GetBlobSizeBytes("testblob_str1") - + workspace.GetBlobSizeBytes("testblob_str2"), strs1_len - strs2_len) + def testFetchFeedBlobZeroDim(self): data = np.empty(shape=(2, 0, 3), dtype=np.float32) self.assertEqual(workspace.FeedBlob("testblob_empty", data), True) diff --git a/caffe2/sgd/CMakeLists.txt b/caffe2/sgd/CMakeLists.txt index 740d9741508b5c..55d0a9124836e4 100644 --- a/caffe2/sgd/CMakeLists.txt +++ b/caffe2/sgd/CMakeLists.txt @@ -9,6 +9,14 @@ set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) file(GLOB tmp *_test.cc) exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) +# ---[ HIP files +# ------[ general GPU +file(GLOB_RECURSE tmp *_hip.cc) +set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) +# exclude test files +file(GLOB_RECURSE tmp *_test.cc) +exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) + # ---[ CPU files. 
file(GLOB tmp *.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) @@ -16,18 +24,26 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) file(GLOB tmp *_test.cc) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) +exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) # ---[ GPU test files file(GLOB tmp *_gpu_test.cc) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) +# ---[ HIP test files +file(GLOB_RECURSE tmp *_hip_test.cc) +set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) + # ---[ CPU test files file(GLOB tmp *_test.cc) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) +exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) # ---[ Send the lists to the parent scope. set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) +set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) +set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu index 71e9f9253ab7d7..df43aaf042ac75 100644 --- a/caffe2/sgd/adagrad_op_gpu.cu +++ b/caffe2/sgd/adagrad_op_gpu.cu @@ -1,5 +1,5 @@ #include -#include "adagrad_op.h" +#include "caffe2/sgd/adagrad_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" #include "caffe2/utils/mixed_utils.h" @@ -19,7 +19,7 @@ __global__ void AdagradUpdate( CUDA_1D_KERNEL_LOOP(i, N) { float gi = g[i]; float hi = nh[i] = decay * h[i] + gi * gi; - nw[i] = w[i] + lr[0] * gi / (std::sqrt(hi) + epsilon); + nw[i] = w[i] + lr[0] * gi / (sqrtf(hi) + epsilon); } } @@ -63,7 +63,7 @@ __global__ void SparseAdagradKernel( mixed_add(grad[gradIdx] * grad[gradIdx], param_mom[paramIdx]); mixed_store(&mom_new, &(param_mom[paramIdx])); float param_new = mixed_add( - LR * grad[gradIdx] / (sqrt(mom_new) + epsilon), param[paramIdx]); + LR * grad[gradIdx] / (sqrtf(mom_new) + epsilon), param[paramIdx]); mixed_store(¶m_new, &(param[paramIdx])); } } @@ -107,7 +107,7 @@ __global__ void RowWiseSparseAdagradKernel( } __syncthreads(); // update param - float step = lr[0] / (std::sqrt(param_mom[index]) + epsilon); + float step = lr[0] / (sqrtf(param_mom[index]) + epsilon); for (int j = threadIdx.x; j < N; j += blockDim.x) { param[index * N + j] = param[index * N + j] + grad[i * N + j] * step; } diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index c9b94e62766f88..8eb1b8835c96da 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -1,4 +1,4 @@ -#include "adam_op.h" +#include "caffe2/sgd/adam_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" @@ -21,7 +21,7 @@ __global__ void AdamUpdate( float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); - ng[i] = lr[0] * correction * mi / (std::sqrt(vi) + eps_hat); + ng[i] = lr[0] * correction * mi / (sqrtf(vi) + eps_hat); } } @@ -66,7 +66,7 @@ __global__ void AdamCompute( float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); - float ng = lr[0] * correction * mi / (std::sqrt(vi) + eps_hat); + float ng = lr[0] * correction * mi / (sqrtf(vi) + eps_hat); nw[i] = w[i] + ng; } } @@ -130,7 +130,7 @@ bool 
SparseAdamOp::DoRunWithType() { auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim()); const auto iter = OperatorBase::Input(ITER).template data()[0]; - const float correction = std::sqrt(1.0f - std::pow(beta2_, iter + 1)) / + const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) / (1.0f - std::pow(beta1_, iter + 1)); SparseAdamKernel diff --git a/caffe2/sgd/fp16_momentum_sgd_op.cu b/caffe2/sgd/fp16_momentum_sgd_op.cu index d8d98bcda90a48..0067fb9a979180 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.cu +++ b/caffe2/sgd/fp16_momentum_sgd_op.cu @@ -1,10 +1,16 @@ #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" -#include "fp16_momentum_sgd_op.h" +#include "caffe2/sgd/fp16_momentum_sgd_op.h" namespace caffe2 { namespace { + +#ifdef __HIPCC__ +typedef __half half; +typedef __half2 half2; +#endif + __global__ void FP16MomentumSGDKernel( int N, const half2* g, diff --git a/caffe2/sgd/fp32_momentum_sgd_op.cu b/caffe2/sgd/fp32_momentum_sgd_op.cu index 17a0d6badefd68..c7947dac440a23 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.cu +++ b/caffe2/sgd/fp32_momentum_sgd_op.cu @@ -1,7 +1,7 @@ #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" -#include "fp32_momentum_sgd_op.h" +#include "caffe2/sgd/fp32_momentum_sgd_op.h" namespace caffe2 { namespace { diff --git a/caffe2/sgd/momentum_sgd_op_gpu.cu b/caffe2/sgd/momentum_sgd_op_gpu.cu index 9ef3f7e5d96da7..74d84f1ada8117 100644 --- a/caffe2/sgd/momentum_sgd_op_gpu.cu +++ b/caffe2/sgd/momentum_sgd_op_gpu.cu @@ -1,4 +1,4 @@ -#include "momentum_sgd_op.h" +#include "caffe2/sgd/momentum_sgd_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" diff --git a/caffe2/sgd/rmsprop_op_gpu.cu b/caffe2/sgd/rmsprop_op_gpu.cu index dd34e10f97c28a..fd293e240308bd 100644 --- a/caffe2/sgd/rmsprop_op_gpu.cu +++ b/caffe2/sgd/rmsprop_op_gpu.cu @@ -1,4 +1,4 @@ -#include "rmsprop_op.h" +#include "caffe2/sgd/rmsprop_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" @@ -21,7 +21,7 @@ __global__ void RmsPropUpdate( nms[i] = ms[i] + (1.0f - decay) * (g[i] * g[i] - ms[i]); // Update momentum estimate nmom[i] = - mom[i] * momentum + lr[0] * g[i] / std::sqrt(epsilon + nms[i]); + mom[i] * momentum + lr[0] * g[i] / sqrtf(epsilon + nms[i]); // New gradient is the momentum ng[i] = nmom[i]; } diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h new file mode 100644 index 00000000000000..a2aa32fb56db7f --- /dev/null +++ b/caffe2/utils/filler.h @@ -0,0 +1,126 @@ +#ifndef CAFFE2_FILLER_H_ +#define CAFFE2_FILLER_H_ + +#include + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class TensorFiller { + public: + template + void Fill(Tensor* tensor) const { + CAFFE_ENFORCE(context_, "context is null"); + CAFFE_ENFORCE(tensor, "tensor is null"); + auto min = static_cast(min_); + auto max = static_cast(max_); + CAFFE_ENFORCE_LE(min, max); + + Tensor temp_tensor(shape_); + tensor->swap(temp_tensor); + Type* data = tensor->template mutable_data(); + Context_t* context = static_cast(context_); + + // TODO: Come up with a good distribution abstraction so that + // the users could plug in their own distribution. 
+ if (has_fixed_sum_) { + auto fixed_sum = static_cast(fixed_sum_); + CAFFE_ENFORCE_LE(min * tensor->size(), fixed_sum); + CAFFE_ENFORCE_GE(max * tensor->size(), fixed_sum); + math::RandFixedSum( + tensor->size(), min, max, fixed_sum_, data, context); + } else { + math::RandUniform( + tensor->size(), min, max, data, context); + } + } + + template + TensorFiller& Min(Type min) { + min_ = (double)min; + return *this; + } + + template + TensorFiller& Max(Type max) { + max_ = (double)max; + return *this; + } + + template + TensorFiller& FixedSum(Type fixed_sum) { + has_fixed_sum_ = true; + fixed_sum_ = (double)fixed_sum; + return *this; + } + + // a helper function to construct the lengths vector for sparse features + template + TensorFiller& SparseLengths(Type total_length) { + return FixedSum(total_length).Min(0).Max(total_length); + } + + // a helper function to construct the segments vector for sparse features + template + TensorFiller& SparseSegments(Type max_segment) { + CAFFE_ENFORCE(!has_fixed_sum_); + return Min(0).Max(max_segment); + } + + TensorFiller& Shape(const std::vector& shape) { + shape_ = shape; + return *this; + } + + // Use new context so that it is independent from its operator + TensorFiller& Context(Context_t* context) { + context_ = (void*)context; + return *this; + } + + template + TensorFiller( + const std::vector& shape, + Type fixed_sum, + Context_t* context) + : shape_(shape), + has_fixed_sum_(true), + fixed_sum_((double)fixed_sum), + context_((void*)context) {} + + TensorFiller(const std::vector& shape, Context_t* context) + : shape_(shape), + has_fixed_sum_(false), + fixed_sum_(0), + context_((void*)context) {} + + TensorFiller() : TensorFiller({}, (Context_t*)nullptr) {} + + std::string DebugString() const { + std::stringstream stream; + stream << "shape = [" << shape_ << "]; min = " << min_ + << "; max = " << max_; + if (has_fixed_sum_) { + stream << "; fixed sum = " << fixed_sum_; + } + return stream.str(); + } + + private: + std::vector shape_; + // TODO: type is unknown until a user starts to fill data; + // cast everything to double for now. + double min_ = 0.0; + double max_ = 1.0; + bool has_fixed_sum_; + double fixed_sum_; + void* context_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_FILLER_H_ diff --git a/caffe2/utils/hip/math_hip.cc b/caffe2/utils/hip/math_hip.cc index 902478337be5d3..9b2eef54216188 100644 --- a/caffe2/utils/hip/math_hip.cc +++ b/caffe2/utils/hip/math_hip.cc @@ -3,6 +3,7 @@ #include "caffe2/utils/math.h" +#include #include #include #include @@ -14,6 +15,8 @@ #include "caffe2/core/hip/context_hip.h" #include "caffe2/utils/conversions.h" +#include "caffe2/utils/fixed_divisor.h" +#include "caffe2/utils/math_utils.h" #if THRUST_VERSION >= 100800 #define THRUST_SUPPORTS_PER_THREAD @@ -26,40 +29,19 @@ namespace math { namespace { -inline __host__ __device__ bool Not(const bool x) { - return !x; -} - -template -inline __host__ __device__ T Negate(const T& x) { - return -x; -} - -template -inline __host__ __device__ T Square(const T& x) { - return x * x; -} - -template -inline __host__ __device__ T Sign(const T& x) { - return x > 0 ? T(1) : (x < 0 ? 
T(-1) : T(0)); -} - -#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ - template \ - struct Func##Functor { \ - inline __host__ __device__ T \ - operator()(const T& lhs, const T& rhs) const { \ - return lhs expr rhs; \ - } \ - }; \ - template <> \ - struct Func##Functor { \ - inline __host__ __device__ float16 \ - operator()(const float16& lhs, const float16& rhs) const { \ - return convert::To(convert::To( \ - lhs) expr convert::To(rhs)); \ - } \ +#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ + template struct Func##Functor { \ + inline __host__ __device__ T operator()(const T &lhs, \ + const T &rhs) const { \ + return lhs expr rhs; \ + } \ + }; \ + template <> struct Func##Functor { \ + inline __host__ __device__ float16 operator()(const float16 &lhs, \ + const float16 &rhs) const { \ + return convert::To(convert::To( \ + lhs) expr convert::To(rhs)); \ + } \ }; DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -) @@ -68,28 +50,18 @@ DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /) #undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR template -__global__ void SimpleBinaryOpHIPKernel( - const int N, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { - HIP_1D_KERNEL_LOOP(i, N) { - C[i] = op(A[i], B[i]); - } +__global__ void SimpleBinaryOpHIPKernel(const int N, const BinaryOperator op, + const TIn *A, const TIn *B, TOut *C) { + HIP_1D_KERNEL_LOOP(i, N) { C[i] = op(A[i], B[i]); } } template -__global__ void RowwiseBinaryOpHIPKenel( - const int rows, - const int cols, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { - const int size = rows * cols; +__global__ void RowwiseBinaryOpHIPKernel(const int size, + const FixedDivisor cols, + const BinaryOperator op, const TIn *A, + const TIn *B, TOut *C) { HIP_1D_KERNEL_LOOP(C_index, size) { - const int j = C_index % cols; + const int j = cols.Mod(C_index); const int A_index = broadcast_1st ? j : C_index; const int B_index = broadcast_1st ? C_index : j; C[C_index] = op(A[A_index], B[B_index]); @@ -97,16 +69,12 @@ __global__ void RowwiseBinaryOpHIPKenel( } template -__global__ void ColwiseBinaryOpHIPKenel( - const int rows, - const int cols, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { - const int size = rows * cols; +__global__ void ColwiseBinaryOpHIPKernel(const int size, + const FixedDivisor cols, + const BinaryOperator op, const TIn *A, + const TIn *B, TOut *C) { HIP_1D_KERNEL_LOOP(C_index, size) { - const int i = C_index / cols; + const int i = cols.Div(C_index); const int A_index = broadcast_1st ? i : C_index; const int B_index = broadcast_1st ? C_index : i; C[C_index] = op(A[A_index], B[B_index]); @@ -114,260 +82,154 @@ __global__ void ColwiseBinaryOpHIPKenel( } template -__global__ void BroadcastBinaryOpHIPKernel( - const int size, - const SimpleArray A_strides, - const SimpleArray B_strides, - const SimpleArray C_dims, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { +__global__ void +BroadcastBinaryOpHIPKernel(const int size, const SimpleArray A_strides, + const SimpleArray B_strides, + const SimpleArray, D> C_dims, + const BinaryOperator op, const TIn *A, const TIn *B, + TOut *C) { HIP_1D_KERNEL_LOOP(C_index, size) { int A_index = 0; int B_index = 0; int C_index_val = C_index; #pragma unroll for (int i = D - 1; i >= 0; --i) { - const int d = C_index_val % C_dims.data[i]; - A_index += A_strides.data[i] == 0 ? 
0 : d * A_strides.data[i]; - B_index += B_strides.data[i] == 0 ? 0 : d * B_strides.data[i]; - C_index_val /= C_dims.data[i]; + int d; + C_dims.data[i].DivMod(C_index_val, &C_index_val, &d); + A_index += d * A_strides.data[i]; + B_index += d * B_strides.data[i]; } C[C_index] = op(A[A_index], B[B_index]); } } template -void BinaryOpWith2DBroadcasting( - const int ndim, - const int* dims, - const int pivot, - const bool rowwise_broadcast, - const bool broadcast_1st, - const BinaryOperator& op, - const TIn* A, - const TIn* B, - TOut* C, - HIPContext* context) { +void BinaryOpWith2DBroadcasting(const int ndim, const int *dims, + const int pivot, const bool rowwise_broadcast, + const bool broadcast_1st, + const BinaryOperator &op, const TIn *A, + const TIn *B, TOut *C, HIPContext *context) { const int rows = std::accumulate(dims, dims + pivot, 1, std::multiplies()); const int cols = std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies()); + if (rows == 0 || cols == 0) { + return; + } const int size = rows * cols; + const FixedDivisor cols_div(cols); if (rowwise_broadcast) { if (broadcast_1st) { hipLaunchKernelGGL( - (RowwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (RowwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } else { hipLaunchKernelGGL( - (RowwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (RowwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } } else { if (broadcast_1st) { hipLaunchKernelGGL( - (ColwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (ColwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } else { hipLaunchKernelGGL( - (ColwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (ColwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } } } template -void BroadcastBinaryOpImpl( - const int* A_dims, - const int* B_dims, - const int* C_dims, - const BinaryOperator& op, - const TIn* A, - const TIn* B, - TOut* C, - HIPContext* context) { +void BroadcastBinaryOpImpl(const int *A_dims, const int *B_dims, + const int *C_dims, const BinaryOperator &op, + const TIn *A, const TIn *B, TOut *C, + HIPContext *context) { SimpleArray A_strides_array; SimpleArray B_strides_array; - SimpleArray C_dims_array; + SimpleArray, D> C_dims_array; int A_stride = 1; int B_stride = 1; for (int i = D - 1; i >= 0; --i) { + if (C_dims[i] == 0) { + return; + } A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride; B_strides_array.data[i] = B_dims[i] == 1 ? 
0 : B_stride; A_stride *= A_dims[i]; B_stride *= B_dims[i]; + C_dims_array.data[i] = FixedDivisor(C_dims[i]); } - std::copy(C_dims, C_dims + D, C_dims_array.data); const int size = std::accumulate(C_dims, C_dims + D, 1, std::multiplies()); - hipLaunchKernelGGL( - (BroadcastBinaryOpHIPKernel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - size, - A_strides_array, - B_strides_array, - C_dims_array, - op, - A, - B, - C); + hipLaunchKernelGGL((BroadcastBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), + 0, context->hip_stream(), size, A_strides_array, + B_strides_array, C_dims_array, op, A, B, C); } template -void BroadcastBinaryOp( - const int A_ndim, - const int* A_dims, - const int B_ndim, - const int* B_dims, - const BinaryOperator& op, - const TIn* A, - const TIn* B, - TOut* C, - HIPContext* context) { +void BroadcastBinaryOp(const int A_ndim, const int *A_dims, const int B_ndim, + const int *B_dims, const BinaryOperator &op, + const TIn *A, const TIn *B, TOut *C, + HIPContext *context) { const int ndim = std::max(A_ndim, B_ndim); std::vector A_dims_array(ndim); std::vector B_dims_array(ndim); std::vector C_dims_array(ndim); - utils::ComputeBroadcastBinaryOpDims( - A_ndim, - A_dims, - B_ndim, - B_dims, - A_dims_array.data(), - B_dims_array.data(), - C_dims_array.data()); + utils::ComputeBroadcastBinaryOpDims(A_ndim, A_dims, B_ndim, B_dims, + A_dims_array.data(), B_dims_array.data(), + C_dims_array.data()); if (A_dims_array == B_dims_array) { - const int size = std::accumulate( - C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies()); - hipLaunchKernelGGL( - (SimpleBinaryOpHIPKernel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - size, - op, - A, - B, - C); + const int size = std::accumulate(C_dims_array.cbegin(), C_dims_array.cend(), + 1, std::multiplies()); + hipLaunchKernelGGL((SimpleBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + size, op, A, B, C); return; } int pivot; bool broadcast_1st; - if (utils::IsRowwiseBroadcastBinaryOp( - ndim, - A_dims_array.data(), - B_dims_array.data(), - &pivot, - &broadcast_1st)) { + if (utils::IsRowwiseBroadcastBinaryOp(ndim, A_dims_array.data(), + B_dims_array.data(), &pivot, + &broadcast_1st)) { BinaryOpWith2DBroadcasting( - ndim, - C_dims_array.data(), - pivot, - true, - broadcast_1st, - op, - A, - B, - C, + ndim, C_dims_array.data(), pivot, true, broadcast_1st, op, A, B, C, context); return; } - if (utils::IsColwiseBroadcastBinaryOp( - ndim, - A_dims_array.data(), - B_dims_array.data(), - &pivot, - &broadcast_1st)) { + if (utils::IsColwiseBroadcastBinaryOp(ndim, A_dims_array.data(), + B_dims_array.data(), &pivot, + &broadcast_1st)) { BinaryOpWith2DBroadcasting( - ndim, - C_dims_array.data(), - pivot, - false, - broadcast_1st, - op, - A, - B, - C, + ndim, C_dims_array.data(), pivot, false, broadcast_1st, op, A, B, C, context); return; } DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( - ndim, - BroadcastBinaryOpImpl, - TIn, - TOut, - BinaryOperator, - A_dims_array.data(), - B_dims_array.data(), - C_dims_array.data(), - op, - A, - B, - C, - context); + ndim, BroadcastBinaryOpImpl, TIn, TOut, BinaryOperator, + A_dims_array.data(), B_dims_array.data(), C_dims_array.data(), op, A, B, + C, context); } + } // namespace -#define DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(T, Func, op) \ - __global__ void Func##HIPKernel(const int N, const T* X, T* Y) { \ - 
HIP_1D_KERNEL_LOOP(i, N) { \ - Y[i] = op(X[i]); \ - } \ - } \ - template <> \ - void Func( \ - const int N, const T* x, T* y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (Func##HIPKernel), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - y); \ +#define DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(T, Func, op) \ + __global__ void Func##HIPKernel(const int N, const T *X, T *Y) { \ + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = op(X[i]); } \ + } \ + template <> \ + void Func(const int N, const T *x, T *y, \ + HIPContext *context) { \ + hipLaunchKernelGGL((Func##HIPKernel), CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, x, \ + y); \ } DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Exp, expf) @@ -378,74 +240,70 @@ DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sin, sinf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Asin, asinf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Tan, tanf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Atan, atanf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sinh, sinhf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Cosh, coshf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Tanh, tanhf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Abs, fabsf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sqr, utils::Square) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sqrt, sqrtf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Rsqrt, rsqrtf) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sqr, Square) - -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(bool, Not, Not) - -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Neg, Negate) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Neg, Negate) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Neg, Negate) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Neg, Negate) - -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sign, Sign) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Sign, Sign) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Sign, Sign) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Sign, Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Cbrt, cbrtf) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Cube, utils::Cube) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Cube, utils::Cube) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Cube, + utils::Cube) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Cube, + utils::Cube) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(bool, Not, utils::Not) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Neg, utils::Negate) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Neg, utils::Negate) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Neg, + utils::Negate) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Neg, + utils::Negate) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sign, utils::Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Sign, utils::Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Sign, + utils::Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Sign, + utils::Sign) #undef DELEGATE_SIMPLE_HIP_UNARY_FUNCTION -#define DELEGATE_SINCOS_HIP_FUNCTION(T, fn) \ - __global__ void _Kernel_##T##_##SinCos( \ - const int N, const T* x, T* ys, T* yc) { \ - HIP_1D_KERNEL_LOOP(i, N) { \ - fn(__ldg(x + i), ys + i, yc + i); \ - } \ - } \ - template <> \ - void SinCos( \ - const int N, const T* x, T* ys, T* yc, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (_Kernel_##T##_##SinCos), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - ys, \ - yc); \ +#define DELEGATE_SINCOS_HIP_FUNCTION(T, fn) \ + __global__ void _Kernel_##T##_##SinCos(const int N, const T 
*x, T *ys, \ + T *yc) { \ + HIP_1D_KERNEL_LOOP(i, N) { fn(__ldg(x + i), ys + i, yc + i); } \ + } \ + template <> \ + void SinCos(const int N, const T *x, T *ys, T *yc, \ + HIPContext *context) { \ + hipLaunchKernelGGL((_Kernel_##T##_##SinCos), CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, x, \ + ys, yc); \ } DELEGATE_SINCOS_HIP_FUNCTION(float, sincosf) DELEGATE_SINCOS_HIP_FUNCTION(double, sincos) -#undef DELEGATE_SINCOS_HIP_FUNCTION - #define DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ - const int N, const TIn* A, const TIn* B, TOut* C, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (SimpleBinaryOpHIPKernel>), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - Op(), \ - A, \ - B, \ - C); \ - } - -#define DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ + void Func(const int N, const TIn *A, const TIn *B, TOut *C, \ + HIPContext *context) { \ + hipLaunchKernelGGL((SimpleBinaryOpHIPKernel>), \ + CAFFE_GET_BLOCKS(N), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), N, Op(), A, B, C); \ + } + +#define DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Func, Op) DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(EQ, thrust::equal_to) @@ -457,11 +315,11 @@ DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(GE, thrust::greater_equal) #undef DEFINE_SIMPLE_HIP_COMPARE_FUNCTION -#define DEFINE_SIMPLE_HIP_BINARY_FUNCTION(Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, float, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, double, Func, Op) \ +#define DEFINE_SIMPLE_HIP_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, double, Func, Op) \ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float16, float16, Func, Op) DEFINE_SIMPLE_HIP_BINARY_FUNCTION(Add, AddFunctor) @@ -475,9 +333,9 @@ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) -#define DEFINE_SIMPLE_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ +#define DEFINE_SIMPLE_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) DEFINE_SIMPLE_HIP_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) @@ 
-490,101 +348,69 @@ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, float, ElemwiseMax, thrust::maximum); #undef DELEGATE_SIMPLE_HIP_BINARY_FUNCTION -#define DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ - template <> \ - void Rowwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (RowwiseBinaryOpHIPKenel, true>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } \ - template <> \ - void Rowwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (RowwiseBinaryOpHIPKenel, false>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } \ - template <> \ - void Colwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (ColwiseBinaryOpHIPKenel, true>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } \ - template <> \ - void Colwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (ColwiseBinaryOpHIPKenel, false>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } - -#define DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ +#define DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Rowwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(RowwiseBinaryOpHIPKernel, true>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, B, \ + C); \ + } \ + template <> \ + void Rowwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(RowwiseBinaryOpHIPKernel, false>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, B, \ + C); \ + } \ + template <> \ + void Colwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(ColwiseBinaryOpHIPKernel, true>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, 
B, \ + C); \ + } \ + template <> \ + void Colwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(ColwiseBinaryOpHIPKernel, false>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, B, \ + C); \ + } + +#define DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(EQ, thrust::equal_to) @@ -596,13 +422,13 @@ DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(GE, thrust::greater_equal) #undef DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION -#define DEFINE_2D_BROADCAST_HIP_BINARY_FUNCTION(Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int32_t, std::int32_t, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int64_t, std::int64_t, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, float, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, double, Func, Op) \ +#define DEFINE_2D_BROADCAST_HIP_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, \ + Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, \ + Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, double, Func, Op) \ DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float16, float16, Func, Op) DEFINE_2D_BROADCAST_HIP_BINARY_FUNCTION(Add, AddFunctor) @@ -616,12 +442,12 @@ DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) -#define DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int32_t, std::int32_t, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int64_t, std::int64_t, Func, Op) +#define DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, \ + Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, \ + Op) DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) @@ -631,26 +457,21 @@ DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #undef DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION -#define DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ - template <> \ - void Func( \ - const int A_ndim, \ - const int* A_dims, \ - const int B_ndim, \ - const int* B_dims, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - BroadcastBinaryOp>( \ - A_ndim, A_dims, B_ndim, B_dims, Op(), A, B, C, context); \ - } - -#define 
DEFINE_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ +#define DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func(const int A_ndim, const int *A_dims, \ + const int B_ndim, const int *B_dims, \ + const TIn *A, const TIn *B, TOut *C, \ + HIPContext *context) { \ + BroadcastBinaryOp>(A_ndim, A_dims, B_ndim, B_dims, \ + Op(), A, B, C, context); \ + } + +#define DEFINE_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) DEFINE_BROADCAST_HIP_COMPARE_FUNCTION(EQ, thrust::equal_to) @@ -693,27 +514,20 @@ DEFINE_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #undef DELEGATE_BROADCAST_HIP_BINARY_FUNCTION -#define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ - template <> \ - void Funcname( \ - const int N, \ - const T* src, \ - T* dst, \ - Tensor* scratch_ptr, \ - HIPContext* context) { \ - size_t memRequired = 0; \ - cub::DeviceReduce::func( \ - nullptr, memRequired, src, dst, N, context->hip_stream()); \ - auto buffer_size = \ - static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ - scratch_ptr->Resize(std::vector{buffer_size}); \ - cub::DeviceReduce::func( \ - static_cast(scratch_ptr->mutable_data()), \ - memRequired, \ - src, \ - dst, \ - N, \ - context->hip_stream()); \ +#define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ + template <> \ + void Funcname(const int N, const T *src, T *dst, \ + Tensor *scratch_ptr, \ + HIPContext *context) { \ + size_t memRequired = 0; \ + cub::DeviceReduce::func(nullptr, memRequired, src, dst, N, \ + context->hip_stream()); \ + auto buffer_size = \ + static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ + scratch_ptr->Resize(std::vector{buffer_size}); \ + cub::DeviceReduce::func( \ + static_cast(scratch_ptr->mutable_data()), memRequired, src, \ + dst, N, context->hip_stream()); \ } DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min) @@ -726,60 +540,34 @@ DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> -void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float *A, const float *B, const float beta, + float *C, HIPContext *context, + TensorProto::DataType math_type) { // Note that rocblas follows fortran order, so the order is different from // the cblas convention. int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? 
rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - ROCBLAS_ENFORCE(rocblas_sgemm( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N)); + ? rocblas_operation_none + : rocblas_operation_transpose; + ROCBLAS_ENFORCE(rocblas_sgemm(context->rocblas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } template <> -void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float16* A, - const float16* B, - const float beta, - float16* C, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float16 *A, const float16 *B, + const float beta, float16 *C, + HIPContext *context, + TensorProto::DataType math_type) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 // rocblas does not support fp16 yet // Note that cublas follows fortran order, so the order is different from @@ -787,30 +575,15 @@ void Gemm( int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; if (math_type == TensorProto_DataType_FLOAT) { - ROCBLAS_CHECK(rocblas_sgemmEx( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &beta, - C, - CUDA_R_16F, - N)); + ROCBLAS_CHECK(rocblas_sgemmEx(context->rocblas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &beta, C, CUDA_R_16F, N)); } else if (math_type == TensorProto_DataType_FLOAT16) { // convert alpha, beta from float -> __half @@ -841,42 +614,19 @@ void Gemm( } template <> -void BiasCHW( - const float* bias, - const float* bias_multiplier, - const int bias_channels, - const int image_size, - float* image, - HIPContext* context) { - Gemm( - CblasNoTrans, - CblasNoTrans, - bias_channels, - image_size, - 1, - 1, - bias, - bias_multiplier, - 1, - image, - context); +void BiasCHW(const float *bias, const float *bias_multiplier, + const int bias_channels, const int image_size, + float *image, HIPContext *context) { + Gemm(CblasNoTrans, CblasNoTrans, bias_channels, image_size, + 1, 1, bias, bias_multiplier, 1, image, context); } template <> void GemmBatched( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int batch_size, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - HIPContext* context, - Tensor* scratch, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int batch_size, const int M, const int N, const int K, + const float alpha, const float *A, const float *B, const float beta, + float *C, HIPContext *context, Tensor *scratch, TensorProto::DataType math_type) { const int a_stride = M * K; const int b_stride = K * N; @@ -886,63 +636,34 @@ void GemmBatched( const int lda = (TransA == CblasNoTrans) ? 
K : M; const int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; ROCBLAS_ENFORCE(rocblas_sgemm_strided_batched( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - b_stride, - A, - lda, - a_stride, - &beta, - C, - N, - c_stride, - batch_size)); + context->rocblas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + b_stride, A, lda, a_stride, &beta, C, N, c_stride, batch_size)); } namespace { -__global__ void FloatToHalfKernel(const int N, const float* X, half* Y) { - HIP_1D_KERNEL_LOOP(i, N) { - Y[i] = __float2half(X[i]); - } +__global__ void FloatToHalfKernel(const int N, const float *X, half *Y) { + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = __float2half(X[i]); } } -__global__ void HalfToFloatKernel(const int N, const half* X, float* Y) { - HIP_1D_KERNEL_LOOP(i, N) { - Y[i] = __half2float(X[i]); - } +__global__ void HalfToFloatKernel(const int N, const half *X, float *Y) { + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = __half2float(X[i]); } } }; // namespace template <> void GemmBatched( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int batch_size, - const int M, - const int N, - const int K, - const float alpha, - const float16* A, - const float16* B, - const float beta, - float16* C, - HIPContext* context, - Tensor* scratch, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int batch_size, const int M, const int N, const int K, + const float alpha, const float16 *A, const float16 *B, const float beta, + float16 *C, HIPContext *context, Tensor *scratch, TensorProto::DataType math_type) { const int a_stride = M * K; const int b_stride = K * N; @@ -961,74 +682,37 @@ void GemmBatched( size_t out_elems = c_stride * batch_size; scratch->Resize(in_elems + out_elems); - float* scratch_ptr = scratch->mutable_data(); + float *scratch_ptr = scratch->mutable_data(); - float* A_fp32 = scratch_ptr; - float* B_fp32 = scratch_ptr + A_size; - float* C_fp32 = scratch_ptr + A_size + B_size; + float *A_fp32 = scratch_ptr; + float *B_fp32 = scratch_ptr + A_size; + float *C_fp32 = scratch_ptr + A_size + B_size; // cast A, B into fp32 - hipLaunchKernelGGL( - (HalfToFloatKernel), - dim3(CAFFE_GET_BLOCKS(A_size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - A_size, - (half*)A, - A_fp32); - hipLaunchKernelGGL( - (HalfToFloatKernel), - dim3(CAFFE_GET_BLOCKS(B_size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - B_size, - (half*)B, - B_fp32); + hipLaunchKernelGGL((HalfToFloatKernel), dim3(CAFFE_GET_BLOCKS(A_size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + A_size, (half *)A, A_fp32); + hipLaunchKernelGGL((HalfToFloatKernel), dim3(CAFFE_GET_BLOCKS(B_size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + B_size, (half *)B, B_fp32); // run fp32 batched Gemm - GemmBatched( - TransA, - TransB, - batch_size, - M, - N, - K, - alpha, - A_fp32, - B_fp32, - beta, - C_fp32, - context); + GemmBatched(TransA, TransB, batch_size, M, N, K, alpha, + A_fp32, B_fp32, beta, C_fp32, context); // cast result back to fp16 - hipLaunchKernelGGL( - (FloatToHalfKernel), - dim3(CAFFE_GET_BLOCKS(batch_size * M * N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - 
context->hip_stream(), - batch_size * M * N, - C_fp32, - (half*)C); + hipLaunchKernelGGL((FloatToHalfKernel), + dim3(CAFFE_GET_BLOCKS(batch_size * M * N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + batch_size * M * N, C_fp32, (half *)C); } else { #if ROCBLAS_FP16 // rocblas does not support fp16 yet if (math_type == TensorProto_DataType_FLOAT) { // loop over matrices in the batch for (int i = 0; i < batch_size; ++i) { - math::Gemm( - TransA, - TransB, - M, - N, - K, - alpha, - A + a_stride * i, - B + b_stride * i, - beta, - C + c_stride * i, - context); + math::Gemm(TransA, TransB, M, N, K, alpha, + A + a_stride * i, B + b_stride * i, + beta, C + c_stride * i, context); } } else if (math_type == TensorProto_DataType_FLOAT16) { // Note that cublas follows fortran order, so the order is different from @@ -1036,122 +720,65 @@ void GemmBatched( const int lda = (TransA == CblasNoTrans) ? K : M; const int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; // convert alpha, beta from float -> __half auto alpha_fp16 = convert::floatToHalf(alpha); auto beta_fp16 = convert::floatToHalf(beta); ROCBLAS_ENFORCE(cublasHgemmStridedBatched( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha_fp16, - (const __half*)B, - ldb, - b_stride, - (const __half*)A, - lda, - a_stride, - &beta_fp16, - (__half*)C, - N, - c_stride, - batch_size)); + context->rocblas_handle(), cuTransB, cuTransA, N, M, K, &alpha_fp16, + (const __half *)B, ldb, b_stride, (const __half *)A, lda, a_stride, + &beta_fp16, (__half *)C, N, c_stride, batch_size)); } #endif } } template <> -void GemmEx( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const int lda, - const float* B, - const int ldb, - const float beta, - float* C, - const int ldc, - HIPContext* context) { +void GemmEx(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float *A, const int lda, const float *B, + const int ldb, const float beta, float *C, + const int ldc, HIPContext *context) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - ROCBLAS_ENFORCE(rocblas_sgemm( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc)); + ? 
rocblas_operation_none + : rocblas_operation_transpose; + ROCBLAS_ENFORCE(rocblas_sgemm(context->rocblas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc)); } template <> -void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const float* A, - const float* x, - const float beta, - float* y, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float *A, + const float *x, const float beta, float *y, + HIPContext *context, + TensorProto::DataType math_type) { rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_transpose - : rocblas_operation_none; - ROCBLAS_ENFORCE(rocblas_sgemv( - context->rocblas_handle(), - cuTransA, - N, - M, - &alpha, - A, - N, - x, - 1, - &beta, - y, - 1)); + ? rocblas_operation_transpose + : rocblas_operation_none; + ROCBLAS_ENFORCE(rocblas_sgemv(context->rocblas_handle(), cuTransA, N, M, + &alpha, A, N, x, 1, &beta, y, 1)); } // Batched Add variants namespace { template -__global__ void AddStripedBatchKernel( - const int N, - const T* first, - T* Y, - const int stripe, - const int batch) { +__global__ void AddStripedBatchKernel(const int N, const T *first, T *Y, + const int stripe, const int batch) { for (int j = 0; j < batch; j++) { - const T* x = first + j * stripe; + const T *x = first + j * stripe; HIP_1D_KERNEL_LOOP(i, N) { float tmpY = convert::To(Y[i]); tmpY += convert::To(x[i]); @@ -1161,26 +788,14 @@ __global__ void AddStripedBatchKernel( } } // namespace -#define CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(T) \ - template <> \ - void AddStripedBatch( \ - const int N, \ - const T* first, \ - T* Y, \ - const int stripe, \ - const int batch, \ - HIPContext* context) { \ - hipLaunchKernelGGL( \ - (AddStripedBatchKernel), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - first, \ - Y, \ - stripe, \ - batch); \ +#define CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(T) \ + template <> \ + void AddStripedBatch(const int N, const T *first, T *Y, \ + const int stripe, const int batch, \ + HIPContext *context) { \ + hipLaunchKernelGGL(AddStripedBatchKernel, CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, \ + first, Y, stripe, batch); \ } CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(float); @@ -1188,22 +803,16 @@ CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(float16); #undef CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH template <> -void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const float16* A, - const float16* x, - const float beta, - float16* y, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float16 *A, + const float16 *x, const float beta, float16 *y, + HIPContext *context, + TensorProto::DataType math_type) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 // rocblas does not support fp16 yet rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_transpose - : rocblas_operation_none; + ? rocblas_operation_transpose + : rocblas_operation_none; // sort out what we need to call cublasSgemmEx / cublasHgemm int m = (cuTransA == rocblas_operation_none) ? 
N : M; @@ -1212,71 +821,39 @@ void Gemv( int LDC = m; if (math_type == TensorProto_DataType_FLOAT) { - ROCBLAS_CHECK(cublasSgemmEx( - context->rocblas_handle(), - cuTransA, - rocblas_operation_none, - m, - 1, - k, - &alpha, - A, - CUDA_R_16F, - LDA, - x, - CUDA_R_16F, - k, - &beta, - y, - CUDA_R_16F, - LDC)); + ROCBLAS_CHECK(cublasSgemmEx(context->rocblas_handle(), cuTransA, + rocblas_operation_none, m, 1, k, &alpha, A, + CUDA_R_16F, LDA, x, CUDA_R_16F, k, &beta, y, + CUDA_R_16F, LDC)); } else if (math_type == TensorProto_DataType_FLOAT16) { auto alpha_fp16 = convert::floatToHalf(alpha); auto beta_fp16 = convert::floatToHalf(beta); - ROCBLAS_CHECK(cublasHgemm( - context->rocblas_handle(), - cuTransA, - rocblas_operation_none, - m, - 1, - k, - &alpha_fp16, - (const __half*)A, - LDA, - (const __half*)x, - k, - &beta_fp16, - (__half*)y, - LDC)); + ROCBLAS_CHECK(cublasHgemm(context->rocblas_handle(), cuTransA, + rocblas_operation_none, m, 1, k, &alpha_fp16, + (const __half *)A, LDA, (const __half *)x, k, + &beta_fp16, (__half *)y, LDC)); } else { // fail CAFFE_THROW("Unsupported math type"); } #endif } + namespace { template -__global__ void SetKernel(const int N, const T alpha, T* Y) { - HIP_1D_KERNEL_LOOP(i, N) { - Y[i] = alpha; - } +__global__ void SetKernel(const int N, const T alpha, T *Y) { + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = alpha; } } } // namespace -#define CAFFE2_SPECIALIZED_HIP_SET(T) \ - template <> \ - void Set( \ - const size_t N, const T alpha, T* Y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (SetKernel), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - static_cast(N), \ - alpha, \ - Y); \ +#define CAFFE2_SPECIALIZED_HIP_SET(T) \ + template <> \ + void Set(const size_t N, const T alpha, T *Y, \ + HIPContext *context) { \ + hipLaunchKernelGGL((SetKernel), CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), \ + static_cast(N), alpha, Y); \ } CAFFE2_SPECIALIZED_HIP_SET(float); @@ -1294,93 +871,56 @@ CAFFE2_SPECIALIZED_HIP_SET(uint16_t); namespace { template -__global__ void -UniformShift(const size_t N, const float min, const float max, T* x) { +__global__ void UniformShift(const size_t N, const float min, const float max, + T *x) { float scale = max - min; HIP_1D_KERNEL_LOOP(i, N) { x[i] = convert::To(convert::To(x[i]) * scale + min); } } -__global__ void -UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { - int* x_int = reinterpret_cast(x); +__global__ void UniformIntFit(const size_t N, const int min, const int max, + unsigned int *x) { + int *x_int = reinterpret_cast(x); int range = (max - min + 1); - HIP_1D_KERNEL_LOOP(i, N) { - x_int[i] = min + static_cast(x[i] % range); - } + HIP_1D_KERNEL_LOOP(i, N) { x_int[i] = min + static_cast(x[i] % range); } } } // namespace template <> -void RandUniform( - const size_t n, - const float min, - const float max, - float* r, - HIPContext* context) { +void RandUniform(const size_t n, const float min, + const float max, float *r, + HIPContext *context) { HIPRAND_ENFORCE(hiprandGenerateUniform(context->hiprand_generator(), r, n)); - hipLaunchKernelGGL( - (UniformShift), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - min, - max, - r); + hipLaunchKernelGGL((UniformShift), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + min, max, r); } template <> -void RandUniform( - const size_t n, - const double min, - const double max, - double* r, - HIPContext* context) 
{ +void RandUniform(const size_t n, const double min, + const double max, double *r, + HIPContext *context) { HIPRAND_ENFORCE( hiprandGenerateUniformDouble(context->hiprand_generator(), r, n)); - hipLaunchKernelGGL( - (UniformShift), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - min, - max, - r); + hipLaunchKernelGGL((UniformShift), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + min, max, r); } template <> -void RandUniform( - const size_t n, - const int min, - const int max, - int* r, - HIPContext* context) { - HIPRAND_ENFORCE(hiprandGenerate( - context->hiprand_generator(), reinterpret_cast(r), n)); - hipLaunchKernelGGL( - (UniformIntFit), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - min, - max, - reinterpret_cast(r)); +void RandUniform(const size_t n, const int min, const int max, + int *r, HIPContext *context) { + HIPRAND_ENFORCE(hiprandGenerate(context->hiprand_generator(), + reinterpret_cast(r), n)); + hipLaunchKernelGGL((UniformIntFit), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + min, max, reinterpret_cast(r)); } template -size_t HandleOddLengthRandGaussian( - const size_t n, - const T mean, - const T std, - T* r, - HIPContext* context) { +size_t HandleOddLengthRandGaussian(const size_t n, const T mean, const T std, + T *r, HIPContext *context) { if (n % 2 == 1) { std::default_random_engine generator; std::normal_distribution distribution(mean, std); @@ -1392,41 +932,31 @@ size_t HandleOddLengthRandGaussian( } template <> -void RandGaussian( - const size_t n, - const float mean, - const float std, - float* r, - HIPContext* context) { +void RandGaussian(const size_t n, const float mean, + const float std, float *r, + HIPContext *context) { // If n is odd, we add a random Gaussian value at the end manually // and generate n-1 random values using curandGenerateNormal. // curandGenerateNormal requires n to be even. 
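  // For example, with n = 5 the helper is expected to fill r[4] on the host
  // and return 4 (the even_n used just below), so hiprandGenerateNormal only
  // has to produce r[0..3].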
const size_t even_n = HandleOddLengthRandGaussian(n, mean, std, r, context); - HIPRAND_ENFORCE(hiprandGenerateNormal( - context->hiprand_generator(), r, even_n, mean, std)); + HIPRAND_ENFORCE(hiprandGenerateNormal(context->hiprand_generator(), r, even_n, + mean, std)); } template <> -void RandGaussian( - const size_t n, - const double mean, - const double std, - double* r, - HIPContext* context) { +void RandGaussian(const size_t n, const double mean, + const double std, double *r, + HIPContext *context) { const size_t even_n = HandleOddLengthRandGaussian(n, mean, std, r, context); - HIPRAND_ENFORCE(hiprandGenerateNormalDouble( - context->hiprand_generator(), r, even_n, mean, std)); + HIPRAND_ENFORCE(hiprandGenerateNormalDouble(context->hiprand_generator(), r, + even_n, mean, std)); } template <> -void Dot( - const int n, - const float* a, - const float* b, - float* y, - HIPContext* context) { +void Dot(const int n, const float *a, const float *b, + float *y, HIPContext *context) { float result; ROCBLAS_ENFORCE( rocblas_sdot(context->rocblas_handle(), n, a, 1, b, 1, &result)); @@ -1434,28 +964,14 @@ void Dot( } template <> -void Dot( - const int n, - const float16* a, - const float16* b, - float16* y, - HIPContext* context) { +void Dot(const int n, const float16 *a, const float16 *b, + float16 *y, HIPContext *context) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 // rocblas does not support fp16 yet float16 result; // execute with 32-bit math - ROCBLAS_CHECK(cublasDotEx( - context->rocblas_handle(), - n, - a, - CUDA_R_16F, - 1, - b, - CUDA_R_16F, - 1, - &result, - CUDA_R_16F, - CUDA_R_32F)); + ROCBLAS_CHECK(cublasDotEx(context->rocblas_handle(), n, a, CUDA_R_16F, 1, b, + CUDA_R_16F, 1, &result, CUDA_R_16F, CUDA_R_32F)); context->Copy(1, &result, y); #endif } @@ -1466,7 +982,7 @@ void Dot( // reduction here. 
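For reference, the single-block pattern used by SumKernel below can be mirrored on the host. The sketch assumes the first (elided) stage strides over X by the thread count, which is the conventional layout for a one-block reduction, and reuses the 128/32 split that is visible in the kernel; BlockSumSketch is an illustrative name, not something defined in this file.

// Host-side sketch of the one-block reduction performed by SumKernel.
// buf plays the role of the kernel's __shared__ reduction_buffer.
static float BlockSumSketch(const float* X, int N, bool square) {
  constexpr int kThreads = 128;  // SUM_KERNEL_NTHREADS
  float buf[kThreads] = {0.0f};
  // Stage 1: each "thread" idx folds X[idx], X[idx + 128], ... into buf[idx].
  for (int idx = 0; idx < kThreads; ++idx) {
    for (int i = idx; i < N; i += kThreads) {
      buf[idx] += square ? X[i] * X[i] : X[i];
    }
  }
  // Stage 2: 128 -> 32, matching the kernel's idx + 32 / + 64 / + 96 step.
  for (int idx = 0; idx < 32; ++idx) {
    buf[idx] += buf[idx + 32] + buf[idx + 64] + buf[idx + 96];
  }
  // Stage 3: 32 -> 1.
  float sum = 0.0f;
  for (int idx = 0; idx < 32; ++idx) {
    sum += buf[idx];
  }
  return sum;
}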
#define SUM_KERNEL_NTHREADS 128 template -__global__ void SumKernel(const int N, const T* X, T* Y, bool square) { +__global__ void SumKernel(const int N, const T *X, T *Y, bool square) { const int idx = threadIdx.x; __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS]; @@ -1488,7 +1004,8 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { // 128 -> 32 if (idx < 32) { reduction_buffer[idx] += reduction_buffer[idx + 32] + - reduction_buffer[idx + 64] + reduction_buffer[idx + 96]; + reduction_buffer[idx + 64] + + reduction_buffer[idx + 96]; } __syncthreads(); // 32 -> 1 @@ -1508,21 +1025,16 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { namespace { -template -__global__ void SumConvertKernel(float* sum, T* dest) { +template __global__ void SumConvertKernel(float *sum, T *dest) { *dest = convert::To(*sum); } template -void SumGenericIter( - const int N, - IterT it, - T*& dest, - HIPContext* context, - Tensor* scratch_ptr) { +void SumGenericIter(const int N, IterT it, T *&dest, HIPContext *context, + Tensor *scratch_ptr) { size_t memRequired = 0; - cub::DeviceReduce::Sum( - nullptr, memRequired, it, dest, N, context->hip_stream()); + cub::DeviceReduce::Sum(nullptr, memRequired, it, dest, N, + context->hip_stream()); auto buffer_size = static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); if (!dest) { @@ -1533,184 +1045,106 @@ void SumGenericIter( scratch_ptr->Resize(std::vector{buffer_size}); } cub::DeviceReduce::Sum( - static_cast(scratch_ptr->template mutable_data()), - memRequired, - it, - dest, - N, - context->hip_stream()); + static_cast(scratch_ptr->template mutable_data()), memRequired, + it, dest, N, context->hip_stream()); } } // namespace template <> -void Sum( - const int N, - const float* x, - float* y, - HIPContext* context, - Tensor* scratch_ptr) { +void Sum(const int N, const float *x, float *y, + HIPContext *context, + Tensor *scratch_ptr) { if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { SumGenericIter(N, x, y, context, scratch_ptr); } else { - hipLaunchKernelGGL( - (SumKernel), - dim3(1), - dim3(SUM_KERNEL_NTHREADS), - 0, - context->hip_stream(), - N, - x, - y, - false); + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, + context->hip_stream(), N, x, y, false); } } template <> -void Sum( - const int N, - const int32_t* x, - int32_t* y, - HIPContext* context, - Tensor* scratch_ptr) { +void Sum(const int N, const int32_t *x, int32_t *y, + HIPContext *context, + Tensor *scratch_ptr) { if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { SumGenericIter(N, x, y, context, scratch_ptr); } else { - hipLaunchKernelGGL( - (SumKernel), - dim3(1), - dim3(SUM_KERNEL_NTHREADS), - 0, - context->hip_stream(), - N, - x, - y, - false); + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, + context->hip_stream(), N, x, y, false); } } namespace { -template -struct FloatTransform { +template struct FloatTransform { inline __host__ __device__ float operator()(const T v) const { return convert::To(v); } }; } // namespace -#define CAFFE2_MATH_SUM_FUNC(T) \ - template <> \ - void Sum( \ - const int N, \ - const T* x, \ - T* y, \ - HIPContext* context, \ - Tensor* scratch_ptr) { \ - if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ - FloatTransform transform; \ - cub::TransformInputIterator, const T*> it( \ - x, transform); \ - float* sum = nullptr; \ - SumGenericIter(N, it, sum, context, scratch_ptr); \ - hipLaunchKernelGGL( \ - (SumConvertKernel), \ - dim3(1), \ - dim3(1), \ - 0, \ - 
context->hip_stream(), \ - sum, \ - y); \ - } else { \ - hipLaunchKernelGGL( \ - (SumKernel), \ - dim3(1), \ - dim3(SUM_KERNEL_NTHREADS), \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - y, \ - false); \ - } \ +#define CAFFE2_MATH_SUM_FUNC(T) \ + template <> \ + void Sum(const int N, const T *x, T *y, HIPContext *context, \ + Tensor *scratch_ptr) { \ + if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ + FloatTransform transform; \ + cub::TransformInputIterator, const T *> it( \ + x, transform); \ + float *sum = nullptr; \ + SumGenericIter(N, it, sum, context, scratch_ptr); \ + hipLaunchKernelGGL((SumConvertKernel), dim3(1), dim3(1), 0, \ + context->hip_stream(), sum, y); \ + } else { \ + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, \ + context->hip_stream(), N, x, y, false); \ + } \ } CAFFE2_MATH_SUM_FUNC(float16) #undef CAFFE2_MATH_SUM_FUNC namespace { -template -struct SqrTransform { - inline __host__ __device__ T operator()(const T v) const { - return v * v; - } +template struct SqrTransform { + inline __host__ __device__ T operator()(const T v) const { return v * v; } }; } // namespace template <> -void SumSqr( - const int N, - const float* x, - float* y, - HIPContext* context, - Tensor* scratch_ptr) { +void SumSqr(const int N, const float *x, float *y, + HIPContext *context, + Tensor *scratch_ptr) { if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { SqrTransform transform; - cub::TransformInputIterator, const float*> it( + cub::TransformInputIterator, const float *> it( x, transform); SumGenericIter(N, it, y, context, scratch_ptr); } else { - hipLaunchKernelGGL( - (SumKernel), - dim3(1), - dim3(SUM_KERNEL_NTHREADS), - 0, - context->hip_stream(), - N, - x, - y, - true); - } -} - -#define CAFFE2_MATH_SUMSQR_FUNC(T) \ - template <> \ - void SumSqr( \ - const int N, \ - const T* x, \ - T* y, \ - HIPContext* context, \ - Tensor* scratch_ptr) { \ - if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ - FloatTransform float_transform; \ - cub::TransformInputIterator, const T*> \ - float_it(x, float_transform); \ - SqrTransform sqr_transform; \ - cub::TransformInputIterator< \ - float, \ - SqrTransform, \ - decltype(float_it)> \ - it(float_it, sqr_transform); \ - float* sum = nullptr; \ - SumGenericIter(N, it, sum, context, scratch_ptr); \ - hipLaunchKernelGGL( \ - (SumConvertKernel), \ - dim3(1), \ - dim3(1), \ - 0, \ - context->hip_stream(), \ - sum, \ - y); \ - } else { \ - hipLaunchKernelGGL( \ - (SumKernel), \ - dim3(1), \ - dim3(SUM_KERNEL_NTHREADS), \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - y, \ - true); \ - } \ + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, + context->hip_stream(), N, x, y, true); + } +} + +#define CAFFE2_MATH_SUMSQR_FUNC(T) \ + template <> \ + void SumSqr(const int N, const T *x, T *y, \ + HIPContext *context, \ + Tensor *scratch_ptr) { \ + if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ + FloatTransform float_transform; \ + cub::TransformInputIterator, const T *> \ + float_it(x, float_transform); \ + SqrTransform sqr_transform; \ + cub::TransformInputIterator, \ + decltype(float_it)> \ + it(float_it, sqr_transform); \ + float *sum = nullptr; \ + SumGenericIter(N, it, sum, context, scratch_ptr); \ + hipLaunchKernelGGL((SumConvertKernel), dim3(1), dim3(1), 0, \ + context->hip_stream(), sum, y); \ + } else { \ + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, \ + context->hip_stream(), N, x, y, true); \ + } \ } CAFFE2_MATH_SUMSQR_FUNC(float16) @@ -1719,59 
+1153,32 @@ CAFFE2_MATH_SUMSQR_FUNC(float16) namespace { template -__global__ void -SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { - HIP_1D_KERNEL_LOOP(i, N) { - y[i] = x[i * D + idx[i]]; - } +__global__ void SelectKernel(const int N, const int D, const T *x, + const int *idx, T *y) { + HIP_1D_KERNEL_LOOP(i, N) { y[i] = x[i * D + idx[i]]; } } } // namespace template <> -void Select( - const int N, - const int D, - const float* x, - const int* idx, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (SelectKernel), - dim3(CAFFE_GET_BLOCKS(N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - D, - x, - idx, - y); +void Select(const int N, const int D, const float *x, + const int *idx, float *y, HIPContext *context) { + hipLaunchKernelGGL((SelectKernel), dim3(CAFFE_GET_BLOCKS(N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, + D, x, idx, y); } template <> -void Select( - const int N, - const int D, - const float16* x, - const int* idx, - float16* y, - HIPContext* context) { - hipLaunchKernelGGL( - (SelectKernel), - dim3(CAFFE_GET_BLOCKS(N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - D, - x, - idx, - y); +void Select(const int N, const int D, const float16 *x, + const int *idx, float16 *y, + HIPContext *context) { + hipLaunchKernelGGL((SelectKernel), dim3(CAFFE_GET_BLOCKS(N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, + D, x, idx, y); } namespace { template -__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { +__global__ void ScaleKernel(const int n, const float alpha, const T *x, T *y) { HIP_1D_KERNEL_LOOP(i, n) { // y[i] = convert::To(convert::To(x[i]) * alpha); y[i] = convert::Get(convert::Get(x[i]) * alpha); @@ -1779,275 +1186,146 @@ __global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { } template -__global__ void -ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) { - HIP_1D_KERNEL_LOOP(i, n) { - y[i] = x[i] * (*alpha); - } +__global__ void ScaleKernelDeviceAlpha(const int n, const float *alpha, + const T *x, T *y) { + HIP_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * (*alpha); } } template -__global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { - HIP_1D_KERNEL_LOOP(i, n) { - y[i] = powf(x[i], exponent); - } +__global__ void PowKernel(const int n, const T *x, const T exponent, T *y) { + HIP_1D_KERNEL_LOOP(i, n) { y[i] = powf(x[i], exponent); } } // fp16 specialization template <> -__global__ void ScaleKernelDeviceAlpha( - const int n, - const float* alpha, - const float16* x, - float16* y) { +__global__ void ScaleKernelDeviceAlpha(const int n, const float *alpha, + const float16 *x, float16 *y) { HIP_1D_KERNEL_LOOP(i, n) { - y[i] = convert::To( - convert::To(x[i]) * (*alpha)); + y[i] = convert::To(convert::To(x[i]) * + (*alpha)); } } } // namespace template <> -void Powx( - const int N, - const float* a, - const float b, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (PowKernel), - dim3(CAFFE_GET_BLOCKS(N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - a, - b, - y); +void Powx(const int N, const float *a, const float b, + float *y, HIPContext *context) { + hipLaunchKernelGGL((PowKernel), dim3(CAFFE_GET_BLOCKS(N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, + a, b, y); } template <> -void Scale( - const int n, - const float alpha, - const float* x, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernel), - 
dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float alpha, const float *x, + float *y, HIPContext *context) { + hipLaunchKernelGGL((ScaleKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, x, y); } template <> -void Scale( - const int n, - const float alpha, - const float16* x, - float16* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float alpha, + const float16 *x, float16 *y, + HIPContext *context) { + hipLaunchKernelGGL((ScaleKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, x, y); } template <> -void Scale( - const int n, - const float* alpha, - const float* x, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernelDeviceAlpha), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float *alpha, const float *x, + float *y, HIPContext *context) { + hipLaunchKernelGGL((ScaleKernelDeviceAlpha), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, x, y); } template <> -void Scale( - const int n, - const float* alpha, - const float16* x, - float16* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernelDeviceAlpha), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float *alpha, + const float16 *x, float16 *y, + HIPContext *context) { + hipLaunchKernelGGL((ScaleKernelDeviceAlpha), + dim3(CAFFE_GET_BLOCKS(n)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), n, alpha, x, y); } template <> -void Axpy( - const int N, - const float alpha, - const float* X, - float* Y, - HIPContext* context) { +void Axpy(const int N, const float alpha, const float *X, + float *Y, HIPContext *context) { ROCBLAS_ENFORCE( rocblas_saxpy(context->rocblas_handle(), N, &alpha, X, 1, Y, 1)); } template <> -void Axpy( - const int N, - const float alpha, - const double* X, - double* Y, - HIPContext* context) { +void Axpy(const int N, const float alpha, const double *X, + double *Y, HIPContext *context) { double alpha_d{alpha}; ROCBLAS_ENFORCE( rocblas_daxpy(context->rocblas_handle(), N, &alpha_d, X, 1, Y, 1)); } template <> -void Axpy( - const int N, - const float alpha, - const float16* X, - float16* Y, - HIPContext* context) { +void Axpy(const int N, const float alpha, const float16 *X, + float16 *Y, HIPContext *context) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 - ROCBLAS_CHECK(cublasAxpyEx( - context->rocblas_handle(), - N, - &alpha, - CUDA_R_16F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); + ROCBLAS_CHECK(cublasAxpyEx(context->rocblas_handle(), N, &alpha, CUDA_R_16F, + X, CUDA_R_16F, 1, Y, CUDA_R_16F, 1, CUDA_R_32F)); #endif } namespace { template -__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) { +__global__ void AxpyKernel(const int n, const float *a, const T *x, T *y) { HIP_1D_KERNEL_LOOP(index, n) { - y[index] = convert::Get( - convert::Get(x[index]) * (*a) + convert::Get(y[index])); + y[index] = convert::Get(convert::Get(x[index]) * (*a) + + convert::Get(y[index])); } } } // namespace template <> -void Axpy( - const int n, - 
const float* alpha, - const float* X, - float* Y, - HIPContext* context) { - hipLaunchKernelGGL( - (AxpyKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - X, - Y); +void Axpy(const int n, const float *alpha, const float *X, + float *Y, HIPContext *context) { + hipLaunchKernelGGL((AxpyKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, X, Y); } template <> -void Axpy( - const int n, - const float* alpha, - const float16* X, - float16* Y, - HIPContext* context) { - hipLaunchKernelGGL( - (AxpyKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - X, - Y); +void Axpy(const int n, const float *alpha, + const float16 *X, float16 *Y, + HIPContext *context) { + hipLaunchKernelGGL((AxpyKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, X, Y); } namespace { template -__global__ void -AxpbyKernel(const int n, const T a, const T* x, const T b, T* y) { - HIP_1D_KERNEL_LOOP(index, n) { - y[index] = x[index] * a + y[index] * b; - } +__global__ void AxpbyKernel(const int n, const T a, const T *x, const T b, + T *y) { + HIP_1D_KERNEL_LOOP(index, n) { y[index] = x[index] * a + y[index] * b; } } } // namespace template <> -void Axpby( - const int n, - const float a, - const float* x, - const float b, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (AxpbyKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - a, - x, - b, - y); +void Axpby(const int n, const float a, const float *x, + const float b, float *y, HIPContext *context) { + hipLaunchKernelGGL((AxpbyKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + a, x, b, y); } namespace { template __global__ void Im2ColNCHWHIPKernel( - const int n, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_h, - const int output_w, - const T* img_data, - T* col_data) { + const int n, const int input_h, const int input_w, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int stride_h, const int stride_w, + const int output_h, const int output_w, const T *img_data, T *col_data) { HIP_1D_KERNEL_LOOP(index, n) { const int w_out = index % output_w; const int h_index = index / output_w; @@ -2057,9 +1335,9 @@ __global__ void Im2ColNCHWHIPKernel( const int h_in = h_out * stride_h - pad_t; const int w_in = w_out * stride_w - pad_l; const int output_size = output_h * output_w; - T* col_data_ptr = + T *col_data_ptr = col_data + (channel_out * output_h + h_out) * output_w + w_out; - const T* img_data_ptr = + const T *img_data_ptr = img_data + (channel_in * input_h + h_in) * input_w + w_in; int dh = 0; for (int i = 0; i < kernel_h; ++i) { @@ -2068,8 +1346,8 @@ __global__ void Im2ColNCHWHIPKernel( const int h = h_in + dh; const int w = w_in + dw; *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) - ? __ldg(img_data_ptr + dh * input_w + dw) - : 0; + ? 
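The Axpy and Axpby kernels above are plain element-wise updates: Axpy with a device-resident alpha computes y[i] = (*alpha) * x[i] + y[i], and Axpby computes y[i] = a * x[i] + b * y[i]. A host-side sketch of both, with the function names chosen here only for illustration:

#include <iostream>
#include <vector>

// y[i] = (*alpha) * x[i] + y[i], with alpha read through a pointer as in AxpyKernel.
void axpy(int n, const float* alpha, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] * (*alpha) + y[i];
}

// y[i] = a * x[i] + b * y[i], as in AxpbyKernel.
void axpby(int n, float a, const float* x, float b, float* y) {
  for (int i = 0; i < n; ++i) y[i] = a * x[i] + b * y[i];
}

int main() {
  std::vector<float> x = {1, 2, 3}, y = {10, 20, 30};
  float alpha = 2.0f;
  axpy(3, &alpha, x.data(), y.data());       // y = {12, 24, 36}
  axpby(3, 0.5f, x.data(), 2.0f, y.data());  // y = {24.5, 49, 73.5}
  std::cout << y[0] << " " << y[1] << " " << y[2] << "\n";
}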
__ldg(img_data_ptr + dh * input_w + dw) + : 0; col_data_ptr += output_size; dw += dilation_w; } @@ -2080,29 +1358,18 @@ __global__ void Im2ColNCHWHIPKernel( template __global__ void Im2ColNHWCHIPKernel( - const int n, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_w, - const int channels, - const T* img_data, - T* col_data) { + const int n, const int input_h, const int input_w, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int stride_h, const int stride_w, + const int output_w, const int channels, const T *img_data, T *col_data) { HIP_1D_KERNEL_LOOP(index, n) { const int channel_in = index % channels; const int w_out = index / channels % output_w; const int h_out = index / channels / output_w; const int h_in = h_out * stride_h - pad_t; const int w_in = w_out * stride_w - pad_l; - T* col_data_ptr = col_data + - (h_out * output_w + w_out) * channels * kernel_h * kernel_w + + T *col_data_ptr = + col_data + (h_out * output_w + w_out) * channels * kernel_h * kernel_w + channel_in; int dh = 0; for (int i = 0; i < kernel_h; ++i) { @@ -2110,9 +1377,10 @@ __global__ void Im2ColNHWCHIPKernel( for (int j = 0; j < kernel_w; ++j) { const int h = h_in + dh; const int w = w_in + dw; - *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) - ? __ldg(img_data + (h * input_w + w) * channels + channel_in) - : 0; + *col_data_ptr = + (h >= 0 && w >= 0 && h < input_h && w < input_w) + ? __ldg(img_data + (h * input_w + w) * channels + channel_in) + : 0; col_data_ptr += channels; dw += dilation_w; } @@ -2122,22 +1390,12 @@ __global__ void Im2ColNHWCHIPKernel( } template -__global__ void Col2ImNCHWHIPKernel( - const int n, - const int input_h, - const int input_w, - const int patch_h, - const int patch_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_h, - const int output_w, - const T* col_data, - T* img_data) { +__global__ void +Col2ImNCHWHIPKernel(const int n, const int input_h, const int input_w, + const int patch_h, const int patch_w, const int dilation_h, + const int dilation_w, const int pad_t, const int pad_l, + const int stride_h, const int stride_w, const int output_h, + const int output_w, const T *col_data, T *img_data) { const int dpatch_h = dilation_h * (patch_h - 1) + 1; const int dpatch_w = dilation_w * (patch_w - 1) + 1; @@ -2173,22 +1431,12 @@ __global__ void Col2ImNCHWHIPKernel( } template -__global__ void Col2ImNHWCHIPKernel( - const int n, - const int input_w, - const int channels, - const int patch_h, - const int patch_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_h, - const int output_w, - const T* col_data, - T* img_data) { +__global__ void +Col2ImNHWCHIPKernel(const int n, const int input_w, const int channels, + const int patch_h, const int patch_w, const int dilation_h, + const int dilation_w, const int pad_t, const int pad_l, + const int stride_h, const int stride_w, const int output_h, + const int output_w, const T *col_data, T *img_data) { const int dpatch_h = dilation_h * (patch_h - 1) + 1; const int dpatch_w = dilation_w * (patch_w - 1) + 1; @@ -2212,8 +1460,8 @@ __global__ void Col2ImNHWCHIPKernel( h_k 
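The Im2Col NCHW kernel above decodes each flat work-item index into a (channel, h_out, w_out) output position, walks the kernel window with dilation, and writes zero for taps that fall into the padding. A host-side sketch of the same indexing (single-threaded, same memory layout):

// Single-threaded sketch of the NCHW im2col indexing above; the flat `index`
// plays the role of one HIP work item.
void im2col_nchw(int channels, int input_h, int input_w,
                 int kernel_h, int kernel_w, int dilation_h, int dilation_w,
                 int pad_t, int pad_l, int stride_h, int stride_w,
                 int output_h, int output_w,
                 const float* img, float* col) {
  const int n = channels * output_h * output_w;
  const int output_size = output_h * output_w;
  for (int index = 0; index < n; ++index) {
    const int w_out = index % output_w;
    const int h_out = (index / output_w) % output_h;
    const int channel_in = index / output_w / output_h;
    const int channel_out = channel_in * kernel_h * kernel_w;
    const int h_in = h_out * stride_h - pad_t;
    const int w_in = w_out * stride_w - pad_l;
    float* col_ptr = col + (channel_out * output_h + h_out) * output_w + w_out;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int h = h_in + i * dilation_h;
        const int w = w_in + j * dilation_w;
        // padding taps contribute zero, as in the kernel's bounds check
        *col_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
                       ? img[(channel_in * input_h + h) * input_w + w]
                       : 0.f;
        col_ptr += output_size;  // next kernel tap lives one output plane further
      }
    }
  }
}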
/= dilation_h; w_k /= dilation_w; const int c_col = (h_k * patch_w + w_k) * channels + c; - val += __ldg( - col_data + (h_col * output_w + w_col) * channels_col + c_col); + val += __ldg(col_data + (h_col * output_w + w_col) * channels_col + + c_col); } } } @@ -2222,18 +1470,13 @@ __global__ void Col2ImNHWCHIPKernel( } template -__global__ void Im2ColNdNCHWHIPKernel( - const int outer_size, - const int inner_size, - const int kernel_size, - SimpleArray img_shape, - SimpleArray col_shape, - SimpleArray kernel_shape, - SimpleArray stride, - SimpleArray dilation, - SimpleArray pad, - const T* X_data, - T* Y_data) { +__global__ void +Im2ColNdNCHWHIPKernel(const int outer_size, const int inner_size, + const int kernel_size, SimpleArray img_shape, + SimpleArray col_shape, + SimpleArray kernel_shape, + SimpleArray stride, SimpleArray dilation, + SimpleArray pad, const T *X_data, T *Y_data) { int d_offset[N]; int d_iter[N]; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { @@ -2256,7 +1499,7 @@ __global__ void Im2ColNdNCHWHIPKernel( #pragma unroll for (int d_i = 0; d_i < N; ++d_i) { const int d_img = d_iter[d_i] * stride.data[d_i] - pad.data[d_i] + - d_offset[d_i] * dilation.data[d_i]; + d_offset[d_i] * dilation.data[d_i]; is_padding |= d_img < 0 || d_img >= img_shape.data[d_i + 1]; img_index = img_index * img_shape.data[d_i + 1] + d_img; } @@ -2270,22 +1513,16 @@ __global__ void Im2ColNdNCHWHIPKernel( } template -void Im2ColNdNCHWHIPImpl( - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* img_data, - float* col_data, - HIPContext* context) { +void Im2ColNdNCHWHIPImpl(const int img_size, const int col_size, + const int *img_shape, const int *col_shape, + const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, + const float *img_data, float *col_data, + HIPContext *context) { const int outer_size = col_shape[0]; const int inner_size = col_size / outer_size; - const int kernel_size = std::accumulate( - kernel_shape, kernel_shape + N, 1, std::multiplies()); + const int kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1, + std::multiplies()); SimpleArray img_shape_array; SimpleArray col_shape_array; SimpleArray kernel_shape_array; @@ -2298,42 +1535,25 @@ void Im2ColNdNCHWHIPImpl( std::memcpy(stride_array.data, stride, N * sizeof(int)); std::memcpy(dilation_array.data, dilation, N * sizeof(int)); std::memcpy(pad_array.data, pad, N * sizeof(int)); - hipLaunchKernelGGL( - (Im2ColNdNCHWHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - kernel_size, - img_shape_array, - col_shape_array, - kernel_shape_array, - stride_array, - dilation_array, - pad_array, - img_data, - col_data); + hipLaunchKernelGGL((Im2ColNdNCHWHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, kernel_size, img_shape_array, + col_shape_array, kernel_shape_array, stride_array, + dilation_array, pad_array, img_data, col_data); } template -void Col2ImNdNCHWHIPImpl( - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* col_data, - float* img_data, - HIPContext* context) { +void Col2ImNdNCHWHIPImpl(const int 
img_size, const int col_size, + const int *img_shape, const int *col_shape, + const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, + const float *col_data, float *img_data, + HIPContext *context) { const int outer_size = col_shape[0]; const int inner_size = col_size / outer_size; - const int kernel_size = std::accumulate( - kernel_shape, kernel_shape + N, 1, std::multiplies()); + const int kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1, + std::multiplies()); SimpleArray img_shape_array; SimpleArray col_shape_array; SimpleArray kernel_shape_array; @@ -2347,309 +1567,130 @@ void Col2ImNdNCHWHIPImpl( std::memcpy(dilation_array.data, dilation, N * sizeof(int)); std::memcpy(pad_array.data, pad, N * sizeof(int)); Set(img_size, 0, img_data, context); - hipLaunchKernelGGL( - (Im2ColNdNCHWHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - kernel_size, - img_shape_array, - col_shape_array, - kernel_shape_array, - stride_array, - dilation_array, - pad_array, - col_data, - img_data); + hipLaunchKernelGGL((Im2ColNdNCHWHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, kernel_size, img_shape_array, + col_shape_array, kernel_shape_array, stride_array, + dilation_array, pad_array, col_data, img_data); } } // namespace template <> void Im2Col( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* img_data, - float* col_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *img_data, + float *col_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = channels * output_h * output_w; hipLaunchKernelGGL( - (Im2ColNCHWHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - height, - width, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_h, - output_w, - img_data, - col_data); + (Im2ColNCHWHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, + height, width, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_h, output_w, img_data, col_data); } template <> void Im2Col( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* img_data, - float* col_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int 
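The launchers above derive the spatial output size from the dilated kernel extent: dkernel = dilation * (k - 1) + 1 and output = (in + pad_lo + pad_hi - dkernel) / stride + 1 with integer division. A worked example (the helper name is illustrative, not part of the diff):

#include <iostream>

// Output extent used by the Im2Col/Col2Im launchers above.
int conv_output_size(int in, int pad_lo, int pad_hi, int kernel, int dilation,
                     int stride) {
  const int dkernel = dilation * (kernel - 1) + 1;       // dilated kernel extent
  return (in + pad_lo + pad_hi - dkernel) / stride + 1;  // floor division
}

int main() {
  // height 7, kernel 3 dilated by 2, padding 1 on both sides, stride 2:
  // dkernel = 2 * (3 - 1) + 1 = 5, output = (7 + 1 + 1 - 5) / 2 + 1 = 3
  std::cout << conv_output_size(7, 1, 1, 3, 2, 2) << "\n";  // prints 3
}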
dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *img_data, + float *col_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = output_h * output_w * channels; hipLaunchKernelGGL( - (Im2ColNHWCHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - height, - width, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_w, - channels, - img_data, - col_data); + (Im2ColNHWCHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, + height, width, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_w, channels, img_data, col_data); } template <> void Col2Im( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* col_data, - float* img_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *col_data, + float *img_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = channels * height * width; hipLaunchKernelGGL( - (Col2ImNCHWHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - height, - width, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_h, - output_w, - col_data, - img_data); + (Col2ImNCHWHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, + height, width, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_h, output_w, col_data, img_data); } template <> void Col2Im( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* col_data, - float* img_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *col_data, + float *img_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - 
dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = height * width * channels; hipLaunchKernelGGL( - (Col2ImNHWCHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - width, - channels, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_h, - output_w, - col_data, - img_data); + (Col2ImNHWCHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, width, + channels, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_h, output_w, col_data, img_data); } template <> void Im2ColNd( - const int N, - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* img_data, - float* col_data, - HIPContext* context) { + const int N, const int img_size, const int col_size, const int *img_shape, + const int *col_shape, const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, const float *img_data, float *col_data, + HIPContext *context) { DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( - N, - Im2ColNdNCHWHIPImpl, - float, - img_size, - col_size, - img_shape, - col_shape, - kernel_shape, - stride, - dilation, - pad, - img_data, - col_data, - context); + N, Im2ColNdNCHWHIPImpl, float, img_size, col_size, img_shape, col_shape, + kernel_shape, stride, dilation, pad, img_data, col_data, context); } template <> void Col2ImNd( - const int N, - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* col_data, - float* img_data, - HIPContext* context) { + const int N, const int img_size, const int col_size, const int *img_shape, + const int *col_shape, const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, const float *col_data, float *img_data, + HIPContext *context) { DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( - N, - Col2ImNdNCHWHIPImpl, - float, - img_size, - col_size, - img_shape, - col_shape, - kernel_shape, - stride, - dilation, - pad, - col_data, - img_data, - context); + N, Col2ImNdNCHWHIPImpl, float, img_size, col_size, img_shape, col_shape, + kernel_shape, stride, dilation, pad, col_data, img_data, context); } template <> -void CopyMatrix( - const size_t itemsize, - const int M, - const int N, - const void* A, - const int lda, - void* B, - const int ldb, - HIPContext* context, - TypeMeta::TypedCopy copy) { +void CopyMatrix(const size_t itemsize, const int M, const int N, + const void *A, const int lda, void *B, + const int ldb, HIPContext *context, + TypeMeta::TypedCopy copy) { CAFFE_ENFORCE(!copy, "Copy constructor is not supported in HIP context"); - hipMemcpy2DAsync( - B, - ldb * itemsize, - A, - lda * itemsize, - N * itemsize, - M, - hipMemcpyDeviceToDevice, - context->hip_stream()); + hipMemcpy2DAsync(B, ldb * itemsize, A, lda * itemsize, N * itemsize, M, + hipMemcpyDeviceToDevice, context->hip_stream()); } template <> -void CopyVector( - const int N, - const float* src, - float* dst, - HIPContext* context) { +void CopyVector(const int N, const float *src, float *dst, + HIPContext *context) { if (src != dst && N > 0) { - hipMemcpyAsync( - dst, - src, - sizeof(float) * N, - hipMemcpyDeviceToDevice, - 
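CopyMatrix above forwards to hipMemcpy2DAsync, which copies M rows of N items whose rows sit lda (source) and ldb (destination) items apart, with pitches given in bytes. A host-side sketch of the same strided copy:

#include <cstddef>
#include <cstring>

// Copy M rows of N elements; rows are lda / ldb elements apart in the source /
// destination, mirroring hipMemcpy2DAsync's byte pitches.
void copy_matrix(std::size_t itemsize, int M, int N,
                 const void* A, int lda, void* B, int ldb) {
  const char* src = static_cast<const char*>(A);
  char* dst = static_cast<char*>(B);
  for (int i = 0; i < M; ++i) {
    std::memcpy(dst + static_cast<std::size_t>(i) * ldb * itemsize,
                src + static_cast<std::size_t>(i) * lda * itemsize,
                static_cast<std::size_t>(N) * itemsize);
  }
}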
context->hip_stream()); + hipMemcpyAsync(dst, src, sizeof(float) * N, hipMemcpyDeviceToDevice, + context->hip_stream()); } } @@ -2659,13 +1700,9 @@ template using BlockReduce = cub::BlockReduce; template -__global__ void RowwiseReduceKernel( - const int rows, - const int cols, - const Reducer reducer, - const T init, - const T* X, - T* Y) { +__global__ void RowwiseReduceKernel(const int rows, const int cols, + const Reducer reducer, const T init, + const T *X, T *Y) { __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < rows; i += gridDim.x) { T val = init; @@ -2681,13 +1718,9 @@ __global__ void RowwiseReduceKernel( } template -__global__ void ColwiseReduceKernel( - const int rows, - const int cols, - const Reducer reducer, - const T init, - const T* X, - T* Y) { +__global__ void ColwiseReduceKernel(const int rows, const int cols, + const Reducer reducer, const T init, + const T *X, T *Y) { __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < cols; i += gridDim.x) { T val = init; @@ -2704,86 +1737,53 @@ __global__ void ColwiseReduceKernel( } // namespace -#define CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX(T) \ - template <> \ - void RowwiseMax( \ - const int N, const int D, const T* x, T* y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (RowwiseReduceKernel), \ - std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - D, \ - cub::Max(), \ - std::numeric_limits::lowest(), \ - x, \ - y); \ +#define CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX(T) \ + template <> \ + void RowwiseMax(const int N, const int D, const T *x, T *y, \ + HIPContext *context) { \ + hipLaunchKernelGGL(RowwiseReduceKernel, \ + std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, D, \ + cub::Max(), std::numeric_limits::lowest(), x, y); \ } CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX(float) #undef CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX -#define CAFFE2_SPECIALIZED_HIP_COLWISE_MAX(T) \ - template <> \ - void ColwiseMax( \ - const int N, const int D, const T* x, T* y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (ColwiseReduceKernel), \ - std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - D, \ - cub::Max(), \ - std::numeric_limits::lowest(), \ - x, \ - y); \ +#define CAFFE2_SPECIALIZED_HIP_COLWISE_MAX(T) \ + template <> \ + void ColwiseMax(const int N, const int D, const T *x, T *y, \ + HIPContext *context) { \ + hipLaunchKernelGGL(ColwiseReduceKernel, \ + std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, D, \ + cub::Max(), std::numeric_limits::lowest(), x, y); \ } CAFFE2_SPECIALIZED_HIP_COLWISE_MAX(float) #undef CAFFE2_SPECIALIZED_HIP_COLWISE_MAX namespace { -__global__ void -maximum_kernel(const int N, const float alpha, const float* x, float* y) { - HIP_1D_KERNEL_LOOP(i, N) { - y[i] = fmaxf(x[i], alpha); - } +__global__ void maximum_kernel(const int N, const float alpha, const float *x, + float *y) { + HIP_1D_KERNEL_LOOP(i, N) { y[i] = fmaxf(x[i], alpha); } } } // namespace template <> -void Maximum( - const int N, - const float alpha, - const float* x, - float* y, - HIPContext* context) { +void Maximum(const int N, const float alpha, const float *x, float *y, + HIPContext *context) { hipLaunchKernelGGL( - (maximum_kernel), - dim3(std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - alpha, - x, - y); + (maximum_kernel), 
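RowwiseReduceKernel and ColwiseReduceKernel above reduce a row-major rows x cols matrix along one dimension, one block per output element, using cub::BlockReduce; RowwiseMax and ColwiseMax instantiate them with cub::Max and lowest() as the identity. A host-side sketch of the max case:

#include <algorithm>
#include <limits>

// One value per row: the identity is lowest(), matching the kernel launch.
void rowwise_max(int N, int D, const float* x, float* y) {
  for (int i = 0; i < N; ++i) {
    float m = std::numeric_limits<float>::lowest();
    for (int j = 0; j < D; ++j) m = std::max(m, x[i * D + j]);
    y[i] = m;
  }
}

// One value per column.
void colwise_max(int N, int D, const float* x, float* y) {
  for (int j = 0; j < D; ++j) {
    float m = std::numeric_limits<float>::lowest();
    for (int i = 0; i < N; ++i) m = std::max(m, x[i * D + j]);
    y[j] = m;
  }
}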
dim3(std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, alpha, x, y); } namespace { template -__global__ void ReduceTensorHIPKernel( - const int outer_size, - const int inner_size, - SimpleArray X_strides, - SimpleArray Y_dims, - const Reducer reducer, - const T init, - const T* X, - T* Y) { +__global__ void +ReduceTensorHIPKernel(const int outer_size, const int inner_size, + SimpleArray X_strides, + SimpleArray, D> Y_dims, + const Reducer reducer, const T init, const T *X, T *Y) { __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { T val = init; @@ -2792,10 +1792,15 @@ __global__ void ReduceTensorHIPKernel( int Y_index = i * inner_size + j; #pragma unroll for (int d = D - 1; d >= 0; --d) { - X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d]; - Y_index /= Y_dims.data[d]; + int r; + Y_dims.data[d].DivMod(Y_index, &Y_index, &r); + X_index += r * X_strides.data[d]; } +#if __HIP_ARCH__ >= 350 val = reducer(val, __ldg(X + X_index)); +#else + val = reducer(val, X[X_index]); +#endif } val = BlockReduce(temp_storage).Reduce(val, reducer); if (threadIdx.x == 0) { @@ -2806,53 +1811,34 @@ __global__ void ReduceTensorHIPKernel( } template -void ReduceTensorHIPImpl( - const int outer_size, - const int inner_size, - const int* dims, - const int* axes, - const Reducer& reducer, - const T& init, - const T* X, - T* Y, - HIPContext* context) { +void ReduceTensorHIPImpl(const int outer_size, const int inner_size, + const int *dims, const int *axes, + const Reducer &reducer, const T &init, const T *X, + T *Y, HIPContext *context) { SimpleArray X_strides; - SimpleArray Y_dims; + SimpleArray, D> Y_dims; utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { - Y_dims.data[i] = dims[axes[i]]; + Y_dims.data[i] = FixedDivisor(dims[axes[i]]); } - hipLaunchKernelGGL( - (ReduceTensorHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - X_strides, - Y_dims, - reducer, - init, - X, - Y); + hipLaunchKernelGGL((ReduceTensorHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, X_strides, Y_dims, reducer, init, + X, Y); } template -void ReduceTensorHIP( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes, - const Reducer& reducer, - const T& init, - const T* X, - T* Y, - HIPContext* context) { +void ReduceTensorHIP(const int num_dims, const int *dims, const int num_axes, + const int *axes, const Reducer &reducer, const T &init, + const T *X, T *Y, HIPContext *context) { CAFFE_ENFORCE_LE(num_axes, num_dims); + if (X == Y) { + return; + } std::vector transpose_axes(num_dims); - utils::ComputeTransposeAxesForReduceOp( - num_dims, num_axes, axes, transpose_axes.data()); + utils::ComputeTransposeAxesForReduceOp(num_dims, num_axes, axes, + transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -2862,48 +1848,27 @@ void ReduceTensorHIP( for (int i = pivot; i < num_dims; ++i) { inner_size *= dims[transpose_axes[i]]; } - if (transpose_axes[pivot] == pivot) { - hipLaunchKernelGGL( - (RowwiseReduceKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - reducer, - init, - X, - Y); - 
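ReduceTensorHIPKernel above maps every flat (outer, inner) output index back into X by peeling off one coordinate per transposed dimension; this revision replaces the plain % and / pair with FixedDivisor<int>::DivMod, which precomputes the divisor so the device code avoids hardware integer division, and falls back to a non-__ldg load on older architectures. A host-side sketch of the index arithmetic for a sum reduction, using ordinary division:

#include <vector>

// Sum all inner elements feeding output i. Y_dims holds the transposed dims of
// X (reduced dims last) and X_strides the matching transposed strides; the
// device kernel does the same digit-by-digit decomposition with
// FixedDivisor<int>::DivMod instead of % and /.
float reduce_one_outer(int i, int inner_size, int D,
                       const std::vector<int>& Y_dims,
                       const std::vector<int>& X_strides,
                       const float* X) {
  float val = 0.f;  // cub::Sum with identity 0; Min/Max only change the combine op
  for (int j = 0; j < inner_size; ++j) {
    int Y_index = i * inner_size + j;
    int X_index = 0;
    for (int d = D - 1; d >= 0; --d) {
      X_index += (Y_index % Y_dims[d]) * X_strides[d];
      Y_index /= Y_dims[d];
    }
    val += X[X_index];
  }
  return val;
}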
return; + if (outer_size > 0 && inner_size > 0) { + if (transpose_axes[pivot] == pivot) { + hipLaunchKernelGGL((RowwiseReduceKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, reducer, init, X, Y); + return; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( + num_dims, ReduceTensorHIPImpl, T, Reducer, outer_size, inner_size, dims, + transpose_axes.data(), reducer, init, X, Y, context); + } else if (outer_size > 0) { + math::Set(outer_size, init, Y, context); } - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( - num_dims, - ReduceTensorHIPImpl, - T, - Reducer, - outer_size, - inner_size, - dims, - transpose_axes.data(), - reducer, - init, - X, - Y, - context); } template -void ReduceMeanHIPImpl( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes, - const T* X, - T* Y, - HIPContext* context) { - ReduceTensorHIP( - num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context); +void ReduceMeanHIPImpl(const int num_dims, const int *dims, const int num_axes, + const int *axes, const T *X, T *Y, HIPContext *context) { + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, + context); const int X_size = std::accumulate(dims, dims + num_dims, 1, std::multiplies()); int scale = 1; @@ -2916,26 +1881,13 @@ void ReduceMeanHIPImpl( } // namespace -#define CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(T) \ - template <> \ - void ReduceMin( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceTensorHIP( \ - num_dims, \ - dims, \ - num_axes, \ - axes, \ - cub::Min(), \ - std::numeric_limits::max(), \ - X, \ - Y, \ - context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(T) \ + template <> \ + void ReduceMin(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Min(), \ + std::numeric_limits::max(), X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(std::int32_t) CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(std::int64_t) @@ -2943,26 +1895,13 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(float) CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(double) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_MIN -#define CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(T) \ - template <> \ - void ReduceMax( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceTensorHIP( \ - num_dims, \ - dims, \ - num_axes, \ - axes, \ - cub::Max(), \ - std::numeric_limits::lowest(), \ - X, \ - Y, \ - context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(T) \ + template <> \ + void ReduceMax(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Max(), \ + std::numeric_limits::lowest(), X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(std::int32_t) CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(std::int64_t) @@ -2970,18 +1909,13 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(float) CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(double) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_MAX -#define CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(T) \ - template <> \ - void ReduceSum( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceTensorHIP( \ - num_dims, dims, num_axes, axes, 
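ReduceTensorHIP above now guards the empty cases (launching nothing when inner_size is zero and filling Y with the reduction identity when only the reduced extent is empty), and ReduceMeanHIPImpl builds the mean as a cub::Sum reduction followed by a rescale with the product of the reduced dimensions. A host-side sketch of that final rescale step:

#include <vector>

// Turn a ReduceSum result into a mean: divide by the number of elements folded
// into each output, i.e. the product of the reduced dimensions.
void scale_sum_to_mean(std::vector<float>& Y, const std::vector<int>& dims,
                       const std::vector<int>& axes) {
  int count = 1;
  for (int a : axes) count *= dims[a];
  const float inv = 1.f / static_cast<float>(count);
  for (float& v : Y) v *= inv;  // the HIP code does this with math::Scale
}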
cub::Sum(), T(0), X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(T) \ + template <> \ + void ReduceSum(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, \ + context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(std::int32_t) CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(std::int64_t) @@ -2989,17 +1923,12 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(float) CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(double) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_SUM -#define CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(T) \ - template <> \ - void ReduceMean( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceMeanHIPImpl(num_dims, dims, num_axes, axes, X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(T) \ + template <> \ + void ReduceMean(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceMeanHIPImpl(num_dims, dims, num_axes, axes, X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(float) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN @@ -3007,20 +1936,16 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(float) namespace { template -__global__ void BroadcastHIPKernel( - const int Y_size, - const SimpleArray X_strides, - const SimpleArray Y_dims, - const T* X, - T* Y) { +__global__ void +BroadcastHIPKernel(const int Y_size, const SimpleArray X_strides, + const SimpleArray Y_dims, const T *X, T *Y) { HIP_1D_KERNEL_LOOP(Y_index, Y_size) { int X_index = 0; int Y_index_val = Y_index; #pragma unroll for (int i = D - 1; i >= 0; --i) { - X_index += X_strides.data[i] == 0 - ? 0 - : (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; + X_index += X_strides.data[i] == 0 ? 
0 : (Y_index_val % Y_dims.data[i]) * + X_strides.data[i]; Y_index_val /= Y_dims.data[i]; } Y[Y_index] = __ldg(X + X_index); @@ -3028,13 +1953,8 @@ __global__ void BroadcastHIPKernel( } template -void BroadcastHIPImpl( - const int X_ndim, - const int* X_dims, - const int* Y_dims, - const T* X, - T* Y, - HIPContext* context) { +void BroadcastHIPImpl(const int X_ndim, const int *X_dims, const int *Y_dims, + const T *X, T *Y, HIPContext *context) { SimpleArray X_strides_array; SimpleArray Y_dims_array; const int d = D - X_ndim; @@ -3048,34 +1968,21 @@ void BroadcastHIPImpl( std::copy_n(Y_dims, D, Y_dims_array.data); const int Y_size = std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies()); - hipLaunchKernelGGL( - (BroadcastHIPKernel), - dim3(CAFFE_GET_BLOCKS(Y_size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - Y_size, - X_strides_array, - Y_dims_array, - X, - Y); + hipLaunchKernelGGL((BroadcastHIPKernel), dim3(CAFFE_GET_BLOCKS(Y_size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + Y_size, X_strides_array, Y_dims_array, X, Y); } } // namespace -#define CAFFE2_SPECIALIZED_HIP_BROADCAST(T) \ - template <> \ - void Broadcast( \ - const int X_ndim, \ - const int* X_dims, \ - const int Y_ndim, \ - const int* Y_dims, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \ - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ - Y_ndim, BroadcastHIPImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_BROADCAST(T) \ + template <> \ + void Broadcast(const int X_ndim, const int *X_dims, \ + const int Y_ndim, const int *Y_dims, \ + const T *X, T *Y, HIPContext *context) { \ + CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \ + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ + Y_ndim, BroadcastHIPImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_BROADCAST(std::int32_t) CAFFE2_SPECIALIZED_HIP_BROADCAST(std::int64_t) @@ -3086,12 +1993,8 @@ CAFFE2_SPECIALIZED_HIP_BROADCAST(double) namespace { template -__global__ void RowwiseMomentsHIPKernel( - const int rows, - const int cols, - const T* X, - T* mean, - T* variance) { +__global__ void RowwiseMomentsHIPKernel(const int rows, const int cols, + const T *X, T *mean, T *variance) { __shared__ typename BlockReduce::TempStorage m_storage; __shared__ typename BlockReduce::TempStorage v_storage; for (int i = blockIdx.x; i < rows; i += gridDim.x) { @@ -3113,14 +2016,10 @@ __global__ void RowwiseMomentsHIPKernel( } template -__global__ void MomentsHIPKernel( - const int outer_size, - const int inner_size, - SimpleArray X_strides, - SimpleArray Y_dims, - const T* X, - T* mean, - T* variance) { +__global__ void MomentsHIPKernel(const int outer_size, const int inner_size, + SimpleArray X_strides, + SimpleArray, D> Y_dims, + const T *X, T *mean, T *variance) { __shared__ typename BlockReduce::TempStorage m_storage; __shared__ typename BlockReduce::TempStorage v_storage; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { @@ -3130,9 +2029,10 @@ __global__ void MomentsHIPKernel( int X_index = 0; int Y_index = i * inner_size + j; #pragma unroll - for (int i = D - 1; i >= 0; --i) { - X_index += (Y_index % Y_dims.data[i]) * X_strides.data[i]; - Y_index /= Y_dims.data[i]; + for (int d = D - 1; d >= 0; --d) { + int r; + Y_dims.data[d].DivMod(Y_index, &Y_index, &r); + X_index += r * X_strides.data[d]; } m_val += __ldg(X + X_index); v_val += __ldg(X + X_index) * __ldg(X + X_index); @@ -3148,50 +2048,30 @@ __global__ void MomentsHIPKernel( } template -void MomentsHIPImpl( - 
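BroadcastHIPKernel above gives X one stride per output dimension and sets the stride to zero wherever X is broadcast, so every output index resolves to a valid input element without any branching on shapes. A host-side sketch of the same zero-stride mapping:

#include <vector>

// X_strides has one entry per output dimension and is 0 wherever X is
// broadcast along that dimension (including dimensions X does not have).
void broadcast(const std::vector<int>& Y_dims, const std::vector<int>& X_strides,
               const float* X, float* Y) {
  int Y_size = 1;
  for (int d : Y_dims) Y_size *= d;
  for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
    int rest = Y_index;
    int X_index = 0;
    for (int i = static_cast<int>(Y_dims.size()) - 1; i >= 0; --i) {
      if (X_strides[i] != 0) X_index += (rest % Y_dims[i]) * X_strides[i];
      rest /= Y_dims[i];
    }
    Y[Y_index] = X[X_index];
  }
}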
const int outer_size, - const int inner_size, - const int* dims, - const int* axes, - const T* X, - T* mean, - T* variance, - HIPContext* context) { +void MomentsHIPImpl(const int outer_size, const int inner_size, const int *dims, + const int *axes, const T *X, T *mean, T *variance, + HIPContext *context) { SimpleArray X_strides; - SimpleArray Y_dims; + SimpleArray, D> Y_dims; utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { - Y_dims.data[i] = dims[axes[i]]; + Y_dims.data[i] = FixedDivisor(dims[axes[i]]); } - hipLaunchKernelGGL( - (MomentsHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - X_strides, - Y_dims, - X, - mean, - variance); + hipLaunchKernelGGL((MomentsHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, X_strides, Y_dims, X, mean, + variance); } template -void MomentsHIP( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes, - const T* X, - T* mean, - T* variance, - HIPContext* context) { +void MomentsHIP(const int num_dims, const int *dims, const int num_axes, + const int *axes, const T *X, T *mean, T *variance, + HIPContext *context) { CAFFE_ENFORCE_LE(num_axes, num_dims); std::vector transpose_axes(num_dims); - utils::ComputeTransposeAxesForReduceOp( - num_dims, num_axes, axes, transpose_axes.data()); + utils::ComputeTransposeAxesForReduceOp(num_dims, num_axes, axes, + transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -3201,47 +2081,27 @@ void MomentsHIP( for (int i = pivot; i < num_dims; ++i) { inner_size *= dims[transpose_axes[i]]; } - if (transpose_axes[pivot] == pivot) { - hipLaunchKernelGGL( - (RowwiseMomentsHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - X, - mean, - variance); - return; + if (outer_size > 0 && inner_size > 0) { + if (transpose_axes[pivot] == pivot) { + hipLaunchKernelGGL((RowwiseMomentsHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, X, mean, variance); + return; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( + num_dims, MomentsHIPImpl, T, outer_size, inner_size, dims, + transpose_axes.data(), X, mean, variance, context); } - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( - num_dims, - MomentsHIPImpl, - T, - outer_size, - inner_size, - dims, - transpose_axes.data(), - X, - mean, - variance, - context); } } // namespace #define CAFFE2_SPECIALIZED_HIP_MOMENTS(T) \ template <> \ - void Moments( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* mean, \ - T* variance, \ - HIPContext* context) { \ + void Moments(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, const T *X, \ + T *mean, T *variance, HIPContext *context) { \ MomentsHIP(num_dims, dims, num_axes, axes, X, mean, variance, context); \ } CAFFE2_SPECIALIZED_HIP_MOMENTS(float) @@ -3250,65 +2110,54 @@ CAFFE2_SPECIALIZED_HIP_MOMENTS(float) namespace { template -__global__ void TransposeHIPKernel( - const int size, - const SimpleArray X_strides, - const SimpleArray Y_dims, - const T* X, - T* Y) { +__global__ void +TransposeHIPKernel(const int size, const SimpleArray 
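The moments kernels above accumulate the sum and the sum of squares per output element (RowwiseMomentsHIPKernel for the contiguous case, MomentsHIPKernel with the FixedDivisor index decomposition otherwise) and derive mean = s1 / count and variance = s2 / count - mean^2. A host-side sketch of the rowwise case:

// Mean and (biased) variance per row of a row-major rows x cols matrix, from a
// single pass over the sum and the sum of squares.
void rowwise_moments(int rows, int cols, const float* X, float* mean,
                     float* variance) {
  for (int i = 0; i < rows; ++i) {
    float s1 = 0.f, s2 = 0.f;
    for (int j = 0; j < cols; ++j) {
      const float v = X[i * cols + j];
      s1 += v;
      s2 += v * v;
    }
    mean[i] = s1 / cols;
    variance[i] = s2 / cols - mean[i] * mean[i];
  }
}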
X_strides, + const SimpleArray, D> Y_dims, const T *X, + T *Y) { HIP_1D_KERNEL_LOOP(Y_index, size) { int X_index = 0; int Y_index_val = Y_index; #pragma unroll for (int i = D - 1; i >= 0; --i) { - X_index += (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; - Y_index_val /= Y_dims.data[i]; + int d; + Y_dims.data[i].DivMod(Y_index_val, &Y_index_val, &d); + X_index += d * X_strides.data[i]; } Y[Y_index] = __ldg(X + X_index); } } template -void TransposeHIPImpl( - const int* dims, - const int* axes, - const T* X, - T* Y, - HIPContext* context) { +void TransposeHIPImpl(const int *dims, const int *axes, const T *X, T *Y, + HIPContext *context) { SimpleArray X_strides; - SimpleArray Y_dims; + SimpleArray, D> Y_dims; utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); int size = 1; for (int i = 0; i < D; ++i) { - Y_dims.data[i] = dims[axes[i]]; + Y_dims.data[i] = FixedDivisor(dims[axes[i]]); size *= dims[i]; } - hipLaunchKernelGGL( - (TransposeHIPKernel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - size, - X_strides, - Y_dims, - X, - Y); + hipLaunchKernelGGL((TransposeHIPKernel), dim3(CAFFE_GET_BLOCKS(size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + size, X_strides, Y_dims, X, Y); } } // namespace -#define CAFFE2_SPECIALIZED_HIP_TRANSPOSE(T) \ - template <> \ - void Transpose( \ - const int ndim, \ - const int* dims, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ - ndim, TransposeHIPImpl, T, dims, axes, X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_TRANSPOSE(T) \ + template <> \ + void Transpose(const int ndim, const int *dims, \ + const int *axes, const T *X, T *Y, \ + HIPContext *context) { \ + if (utils::IsIdentityPermutation(ndim, axes)) { \ + const int size = \ + std::accumulate(dims, dims + ndim, 1, std::multiplies()); \ + context->template Copy(size, X, Y); \ + return; \ + } \ + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(ndim, TransposeHIPImpl, T, dims, \ + axes, X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_TRANSPOSE(float) CAFFE2_SPECIALIZED_HIP_TRANSPOSE(double) diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index 88d8db509847e1..ca8535e4aa3dd3 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -383,6 +383,17 @@ void Set(const size_t N, const T alpha, T* X, Context* context); template void RandUniform(const size_t n, const T a, const T b, T* r, Context* context); +// Generate n values that sum up to a fixed sum +// and subject to a restriction a <= x <= b for each x generated +template +void RandFixedSum( + const size_t n, + const T a, + const T b, + const T sum, + T* r, + Context* context); + template void RandUniformUnique( const size_t n, diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 72c22853b2dfdb..6ebf41ab7cba12 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -1714,31 +1714,101 @@ DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #undef DELEGATE_BROADCAST_BINARY_FUNCTION -template <> -void RandUniform( - const size_t n, - const float a, - const float b, - float* r, - CPUContext* context) { - std::uniform_real_distribution distribution(a, b); - for (size_t i = 0; i < n; ++i) { - r[i] = distribution(context->RandGenerator()); - } -} - -template <> -void RandUniform( - const size_t n, - const int a, - const int b, - int* r, - CPUContext* context) { - std::uniform_int_distribution distribution(a, b); - for (size_t i = 0; i < n; ++i) { - r[i] = 
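The Transpose specialization above adds an identity-permutation fast path that degenerates to a flat copy, and otherwise decomposes each output index along the permuted dimensions (FixedDivisor::DivMod on the device) and maps it through the transposed strides of X. A host-side sketch with ordinary division, where the stride computation stands in for utils::ComputeTransposedStrides:

#include <cstring>
#include <vector>

// Y[j0, ..., j_{D-1}] = X[permuted index].
void transpose(int ndim, const std::vector<int>& dims, const std::vector<int>& axes,
               const float* X, float* Y) {
  bool identity = true;
  for (int i = 0; i < ndim; ++i) identity = identity && (axes[i] == i);

  int size = 1;
  for (int i = 0; i < ndim; ++i) size *= dims[i];
  if (identity) {                               // fast path added by this change:
    std::memcpy(Y, X, sizeof(float) * size);    // identity permutation is a copy
    return;
  }

  std::vector<int> strides(ndim, 1), X_strides(ndim), Y_dims(ndim);
  for (int i = ndim - 2; i >= 0; --i) strides[i] = strides[i + 1] * dims[i + 1];
  for (int i = 0; i < ndim; ++i) {
    X_strides[i] = strides[axes[i]];  // stride of X along the permuted dimension
    Y_dims[i] = dims[axes[i]];
  }
  for (int Y_index = 0; Y_index < size; ++Y_index) {
    int rest = Y_index, X_index = 0;
    for (int i = ndim - 1; i >= 0; --i) {
      X_index += (rest % Y_dims[i]) * X_strides[i];  // FixedDivisor::DivMod on device
      rest /= Y_dims[i];
    }
    Y[Y_index] = X[X_index];
  }
}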
distribution(context->RandGenerator()); - } -} +#define CAFFE2_RAND_UNIFORM_REAL(T) \ + template <> \ + void RandUniform( \ + const size_t n, const T a, const T b, T* r, CPUContext* context) { \ + std::uniform_real_distribution distribution(a, b); \ + for (size_t i = 0; i < n; ++i) { \ + r[i] = distribution(context->RandGenerator()); \ + } \ + } +CAFFE2_RAND_UNIFORM_REAL(float); +CAFFE2_RAND_UNIFORM_REAL(double); +#undef CAFFE2_RAND_UNIFORM_REAL + +#define CAFFE2_RAND_UNIFORM_CHAR(T) \ + template <> \ + void RandUniform( \ + const size_t n, const T a, const T b, T* r, CPUContext* context) { \ + std::uniform_int_distribution distribution((short)a, (short)b); \ + for (size_t i = 0; i < n; ++i) { \ + r[i] = static_cast(distribution(context->RandGenerator())); \ + } \ + } +CAFFE2_RAND_UNIFORM_CHAR(int8_t); +CAFFE2_RAND_UNIFORM_CHAR(uint8_t); +#undef CAFFE2_RAND_UNIFORM_CHAR + +#define CAFFE2_RAND_UNIFORM_INT(T) \ + template <> \ + void RandUniform( \ + const size_t n, const T a, const T b, T* r, CPUContext* context) { \ + std::uniform_int_distribution distribution(a, b); \ + for (size_t i = 0; i < n; ++i) { \ + r[i] = distribution(context->RandGenerator()); \ + } \ + } + +CAFFE2_RAND_UNIFORM_INT(int16_t); +CAFFE2_RAND_UNIFORM_INT(int32_t); +CAFFE2_RAND_UNIFORM_INT(int64_t); +CAFFE2_RAND_UNIFORM_INT(uint16_t); +CAFFE2_RAND_UNIFORM_INT(uint32_t); +CAFFE2_RAND_UNIFORM_INT(uint64_t); +#undef CAFFE2_RAND_UNIFORM_INT + +// This is not uniformly distributed between a and b. +// It takes advantage of normal distribution to generate numbers +// with mean = sum / n. +// Ideally the algorithm should be generating n numbers between 0 and 1, +// sum them up as scaled_sum, and use sum / scaled_sum to adjust the values +// to between a and b. +// The algorithm is non-trivial given the adjustment would be different towards +// each value. +#define CAFFE2_RAND_FIXED_SUM(T) \ + template <> \ + void RandFixedSum( \ + const size_t n, \ + const T a, \ + const T b, \ + const T sum, \ + T* r, \ + CPUContext* context) { \ + CAFFE_ENFORCE_GE(a, 0); \ + CAFFE_ENFORCE_GE(sum / (double)n, a); \ + CAFFE_ENFORCE_LE(sum / (double)n, b); \ + T current_sum = 0; \ + for (size_t i = 0; i < n - 1; ++i) { \ + auto remaining_numbers = n - 1 - i; \ + double mean = (sum - current_sum) / remaining_numbers; \ + double stdev = std::min(mean - a, b - mean); \ + std::normal_distribution distribution{mean, stdev / 4.0}; \ + T value = distribution(context->RandGenerator()); \ + auto remaining_sum = sum - current_sum - value; \ + if (value < a || remaining_sum > b * remaining_numbers) { \ + value = a; \ + } else if (value > b || remaining_sum < a * remaining_numbers) { \ + value = b; \ + } \ + r[i] = value; \ + CAFFE_ENFORCE(a <= value && value <= b); \ + current_sum += value; \ + } \ + r[n - 1] = sum - current_sum; \ + CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \ + } +CAFFE2_RAND_FIXED_SUM(float); +CAFFE2_RAND_FIXED_SUM(double); +CAFFE2_RAND_FIXED_SUM(int8_t); +CAFFE2_RAND_FIXED_SUM(int16_t); +CAFFE2_RAND_FIXED_SUM(int32_t); +CAFFE2_RAND_FIXED_SUM(int64_t); +CAFFE2_RAND_FIXED_SUM(uint8_t); +CAFFE2_RAND_FIXED_SUM(uint16_t); +CAFFE2_RAND_FIXED_SUM(uint32_t); +CAFFE2_RAND_FIXED_SUM(uint64_t); +#undef CAFFE2_RAND_FIXED_SUM #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ template <> \ diff --git a/caffe2/utils/mixed_utils_hip.h b/caffe2/utils/mixed_utils_hip.h new file mode 100644 index 00000000000000..f8e07dca2cd134 --- /dev/null +++ b/caffe2/utils/mixed_utils_hip.h @@ -0,0 +1,116 @@ +// Copyright 2004-present Facebook. 
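RandFixedSum above generates n values in [a, b] whose total is exactly `sum`: each value is drawn from a normal distribution centred on the mean still required to hit the target, clamped so the remainder stays achievable, and the last element absorbs the exact residual, so the result is not uniformly distributed. A simplified host-side sketch of that scheme, assuming a <= sum / n <= b as the macro enforces:

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

// Draw each value around the mean still needed to reach `sum`, keep it inside
// [a, b], and make sure the remaining budget stays representable; the last
// element is set to the exact residual.
std::vector<double> rand_fixed_sum(std::size_t n, double a, double b,
                                   double sum, std::mt19937& gen) {
  std::vector<double> r(n);
  if (n == 0) return r;
  double current = 0.0;
  for (std::size_t i = 0; i + 1 < n; ++i) {
    const double remaining = static_cast<double>(n - 1 - i);  // values after this one
    const double mean = (sum - current) / (remaining + 1.0);
    const double sigma = std::max(std::min(mean - a, b - mean) / 4.0, 1e-12);
    std::normal_distribution<double> dist(mean, sigma);
    double v = std::min(b, std::max(a, dist(gen)));
    v = std::min(v, sum - current - a * remaining);  // leave at least a per later value
    v = std::max(v, sum - current - b * remaining);  // leave at most b per later value
    r[i] = v;
    current += v;
  }
  r[n - 1] = sum - current;  // exact by construction, and in [a, b] given a <= sum/n <= b
  return r;
}

The macro additionally re-checks a <= value <= b with CAFFE_ENFORCE after every draw; the sketch relies on the clamps instead.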
All Rights Reserved. +#ifndef CAFFE2_UTILS_MIXED_UTILS_HIP_H +#define CAFFE2_UTILS_MIXED_UTILS_HIP_H + +#include "caffe2/core/hip/common_hip.h" +#include "caffe2/core/hip/context_hip.h" + +// define functions to allow add/mult/store operaions for input/output with +// mixed precisions. +namespace caffe2 { + +// functions that will only be triggered when there is no spcialized version +// supported +template +inline __device__ T mixed_mult(T data1, T2 data2) +{ + return data1 * data2; +}; + +template +inline __device__ T mixed_add(T data1, T2 data2) +{ + return data1 + data2; +}; + +template +inline __device__ void mixed_store(TIN* data_in, TOUT* data_out) +{ + *data_out = *data_in; + return; +}; + +template +inline __device__ void mixed_store(T* data_in, T* data_out) +{ + *data_out = *data_in; + return; +}; + +template <> +inline __device__ float mixed_mult(float data1, const float data2) +{ + return data1 * data2; +} + +template <> +inline __device__ float mixed_mult(float data1, const half data2) +{ + return data1 * __half2float(data2); +} + +template <> +inline __device__ float mixed_mult(float data1, float16 data2) +{ + half* data2_half = reinterpret_cast(&data2); + return data1 * __half2float(*data2_half); +} +template <> +inline __device__ float mixed_add(float data1, const float data2) +{ + return data1 + data2; +} + +template <> +inline __device__ float mixed_add(float data1, const half data2) +{ + return data1 + __half2float(data2); +} + +template <> +inline __device__ float mixed_add(float data1, float16 data2) +{ + half* data2_half = reinterpret_cast(&data2); + return data1 + __half2float(*data2_half); +} + +template <> +inline __device__ void mixed_store(float* data_in, float* data_out) +{ + *data_out = *data_in; + return; +} + +template <> +inline __device__ void mixed_store(half* data_in, float* data_out) +{ + *data_out = __half2float(*data_in); + return; +} + +template <> +inline __device__ void mixed_store(float16* data_in, float* data_out) +{ + half* data_in_half = reinterpret_cast(data_in); + *data_out = __half2float(*data_in_half); + return; +} + +template <> +inline __device__ void mixed_store(float* data_in, float16* data_out) +{ + half data_in_half = __float2half(*data_in); + float16* data_in_float16 = reinterpret_cast(&data_in_half); + *data_out = *data_in_float16; + return; +} + +template <> +inline __device__ void mixed_store(float* data_in, half* data_out) +{ + half data_in_half = __float2half(*data_in); + *data_out = data_in_half; + return; +} +} // namespace caffe2 +#endif // for CAFFE2_UTILS_MIXED_UTILS_HIP_H diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake index 84ed2c4c6edf7d..a18fb2911ab93b 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -176,6 +176,7 @@ function(aten_compile_options libname) -Wextra -fexceptions -Wno-missing-field-initializers + -Wno-strict-overflow -Wno-type-limits -Wno-unused-parameter -Wno-unknown-warning-option diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 538d2603962b54..987044bbd212f4 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -1102,6 +1102,11 @@ Linear functions .. autofunction:: linear +:hidden:`bilinear` +~~~~~~~~~~~~~~~~ + +.. autofunction:: bilinear + Dropout functions ----------------- diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 86a862316dab92..89f96cf174ddbe 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -329,6 +329,7 @@ view of a storage and defines numeric operations on it. .. automethod:: repeat .. 
automethod:: requires_grad_ .. automethod:: reshape + .. automethod:: reshape_as .. automethod:: resize_ .. automethod:: resize_as_ .. automethod:: round @@ -336,6 +337,7 @@ view of a storage and defines numeric operations on it. .. automethod:: rsqrt .. automethod:: rsqrt_ .. automethod:: scatter_ + .. automethod:: scatter_add_ .. automethod:: select .. automethod:: set_ .. automethod:: share_memory_ diff --git a/docs/source/torch.rst b/docs/source/torch.rst index b8d6e2a349fe8d..75b71fd314bb56 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -223,6 +223,7 @@ Comparison Ops .. autofunction:: equal .. autofunction:: ge .. autofunction:: gt +.. autofunction:: isfinite .. autofunction:: isinf .. autofunction:: isnan .. autofunction:: kthvalue @@ -250,6 +251,7 @@ Spectral Ops Other Operations ~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: bincount .. autofunction:: cross .. autofunction:: diag .. autofunction:: diagflat @@ -257,6 +259,7 @@ Other Operations .. autofunction:: einsum .. autofunction:: flip .. autofunction:: histc +.. autofunction:: meshgrid .. autofunction:: renorm .. autofunction:: trace .. autofunction:: tril diff --git a/setup.py b/setup.py index 85f2f5bca63969..042d8668bb7b96 100644 --- a/setup.py +++ b/setup.py @@ -318,6 +318,8 @@ def build_libs(libs): if USE_CUDA: my_env["CUDA_BIN_PATH"] = CUDA_HOME build_libs_cmd += ['--use-cuda'] + if IS_WINDOWS: + my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME if USE_ROCM: build_libs_cmd += ['--use-rocm'] if USE_NNPACK: @@ -338,6 +340,8 @@ def build_libs(libs): if FULL_CAFFE2: build_libs_cmd += ['--full-caffe2'] + my_env["BUILD_TORCH"] = "ON" + if subprocess.call(build_libs_cmd + libs, env=my_env) != 0: print("Failed to run '{}'".format(' '.join(build_libs_cmd + libs))) sys.exit(1) @@ -640,6 +644,7 @@ def run(self): '-std=c++11', '-Wall', '-Wextra', + '-Wno-strict-overflow', '-Wno-unused-parameter', '-Wno-missing-field-initializers', '-Wno-write-strings', @@ -719,120 +724,73 @@ def run(self): main_compile_args = ['-D_THP_CORE', '-DONNX_NAMESPACE=' + ONNX_NAMESPACE] main_libraries = ['shm'] main_link_args = CAFFE2_LIBS + [NANOPB_STATIC_LIB, PROTOBUF_STATIC_LIB] +if IS_WINDOWS: + main_link_args.append(os.path.join(lib_path, 'torch.lib')) +elif IS_DARWIN: + main_link_args.append(os.path.join(lib_path, 'libtorch.dylib')) +else: + main_link_args.append(os.path.join(lib_path, 'libtorch.so')) main_sources = [ - "torch/csrc/PtrWrapper.cpp", - "torch/csrc/Module.cpp", - "torch/csrc/Generator.cpp", - "torch/csrc/Size.cpp", - "torch/csrc/Dtype.cpp", + "torch/csrc/DataLoader.cpp", "torch/csrc/Device.cpp", + "torch/csrc/Dtype.cpp", + "torch/csrc/DynamicTypes.cpp", "torch/csrc/Exceptions.cpp", + "torch/csrc/Generator.cpp", "torch/csrc/Layout.cpp", + "torch/csrc/Module.cpp", + "torch/csrc/PtrWrapper.cpp", + "torch/csrc/Size.cpp", "torch/csrc/Storage.cpp", - "torch/csrc/DataLoader.cpp", - "torch/csrc/DynamicTypes.cpp", - "torch/csrc/assertions.cpp", + "torch/csrc/autograd/functions/init.cpp", + "torch/csrc/autograd/generated/python_functions.cpp", + "torch/csrc/autograd/generated/python_nn_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions.cpp", + "torch/csrc/autograd/generated/python_variable_methods.cpp", + "torch/csrc/autograd/init.cpp", + "torch/csrc/autograd/python_anomaly_mode.cpp", + "torch/csrc/autograd/python_cpp_function.cpp", + "torch/csrc/autograd/python_engine.cpp", + "torch/csrc/autograd/python_function.cpp", + "torch/csrc/autograd/python_hook.cpp", + "torch/csrc/autograd/python_legacy_variable.cpp", + 
"torch/csrc/autograd/python_variable.cpp", + "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/byte_order.cpp", - "torch/csrc/torch.cpp", - "torch/csrc/utils.cpp", - "torch/csrc/utils/cuda_lazy_init.cpp", - "torch/csrc/utils/invalid_arguments.cpp", - "torch/csrc/utils/object_ptr.cpp", - "torch/csrc/utils/python_arg_parser.cpp", - "torch/csrc/utils/tensor_list.cpp", - "torch/csrc/utils/tensor_new.cpp", - "torch/csrc/utils/tensor_numpy.cpp", - "torch/csrc/utils/tensor_dtypes.cpp", - "torch/csrc/utils/tensor_layouts.cpp", - "torch/csrc/utils/tensor_types.cpp", - "torch/csrc/utils/tuple_parser.cpp", - "torch/csrc/utils/tensor_apply.cpp", - "torch/csrc/utils/tensor_conversion_dispatch.cpp", - "torch/csrc/utils/tensor_flatten.cpp", - "torch/csrc/utils/variadic.cpp", - "torch/csrc/serialization.cpp", "torch/csrc/finalizer.cpp", + "torch/csrc/jit/batched/BatchTensor.cpp", "torch/csrc/jit/init.cpp", - "torch/csrc/jit/interpreter.cpp", - "torch/csrc/jit/register_prim_ops.cpp", + "torch/csrc/jit/passes/onnx.cpp", + "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", + "torch/csrc/jit/passes/onnx/peephole.cpp", + "torch/csrc/jit/passes/to_batch.cpp", + "torch/csrc/jit/python_arg_flatten.cpp", "torch/csrc/jit/python_interpreter.cpp", - "torch/csrc/jit/ir.cpp", - "torch/csrc/jit/fusion_compiler.cpp", - "torch/csrc/jit/graph_executor.cpp", "torch/csrc/jit/python_ir.cpp", - "torch/csrc/jit/test_jit.cpp", - "torch/csrc/jit/tracer.cpp", - "torch/csrc/jit/tracer_state.cpp", "torch/csrc/jit/python_tracer.cpp", - "torch/csrc/jit/passes/shape_analysis.cpp", - "torch/csrc/jit/interned_strings.cpp", - "torch/csrc/jit/type.cpp", - "torch/csrc/jit/export.cpp", - "torch/csrc/jit/import.cpp", - "torch/csrc/jit/autodiff.cpp", - "torch/csrc/jit/python_arg_flatten.cpp", - "torch/csrc/jit/variable_flags.cpp", - "torch/csrc/jit/passes/create_autodiff_subgraphs.cpp", - "torch/csrc/jit/passes/graph_fuser.cpp", - "torch/csrc/jit/passes/onnx.cpp", - "torch/csrc/jit/passes/dead_code_elimination.cpp", - "torch/csrc/jit/passes/remove_expands.cpp", - "torch/csrc/jit/passes/lower_tuples.cpp", - "torch/csrc/jit/passes/lower_grad_of.cpp", - "torch/csrc/jit/passes/common_subexpression_elimination.cpp", - "torch/csrc/jit/passes/peephole.cpp", - "torch/csrc/jit/passes/inplace_check.cpp", - "torch/csrc/jit/passes/canonicalize.cpp", - "torch/csrc/jit/passes/batch_mm.cpp", - "torch/csrc/jit/passes/decompose_addmm.cpp", - "torch/csrc/jit/passes/specialize_undef.cpp", - "torch/csrc/jit/passes/erase_number_types.cpp", - "torch/csrc/jit/passes/loop_unrolling.cpp", - "torch/csrc/jit/passes/to_batch.cpp", - "torch/csrc/jit/passes/onnx/peephole.cpp", - "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", - "torch/csrc/jit/generated/register_aten_ops.cpp", - "torch/csrc/jit/operator.cpp", + "torch/csrc/jit/script/init.cpp", "torch/csrc/jit/script/lexer.cpp", - "torch/csrc/jit/script/compiler.cpp", "torch/csrc/jit/script/module.cpp", - "torch/csrc/jit/script/init.cpp", "torch/csrc/jit/script/python_tree_views.cpp", - "torch/csrc/jit/batched/BatchTensor.cpp", - "torch/csrc/autograd/init.cpp", - "torch/csrc/autograd/aten_variable_hooks.cpp", - "torch/csrc/autograd/grad_mode.cpp", - "torch/csrc/autograd/anomaly_mode.cpp", - "torch/csrc/autograd/python_anomaly_mode.cpp", - "torch/csrc/autograd/engine.cpp", - "torch/csrc/autograd/function.cpp", - "torch/csrc/autograd/variable.cpp", - "torch/csrc/autograd/saved_variable.cpp", - "torch/csrc/autograd/input_buffer.cpp", - "torch/csrc/autograd/profiler.cpp", - 
"torch/csrc/autograd/python_function.cpp", - "torch/csrc/autograd/python_cpp_function.cpp", - "torch/csrc/autograd/python_variable.cpp", - "torch/csrc/autograd/python_variable_indexing.cpp", - "torch/csrc/autograd/python_legacy_variable.cpp", - "torch/csrc/autograd/python_engine.cpp", - "torch/csrc/autograd/python_hook.cpp", - "torch/csrc/autograd/generated/VariableType.cpp", - "torch/csrc/autograd/generated/Functions.cpp", - "torch/csrc/autograd/generated/python_torch_functions.cpp", - "torch/csrc/autograd/generated/python_variable_methods.cpp", - "torch/csrc/autograd/generated/python_functions.cpp", - "torch/csrc/autograd/generated/python_nn_functions.cpp", - "torch/csrc/autograd/functions/basic_ops.cpp", - "torch/csrc/autograd/functions/tensor.cpp", - "torch/csrc/autograd/functions/accumulate_grad.cpp", - "torch/csrc/autograd/functions/utils.cpp", - "torch/csrc/autograd/functions/init.cpp", "torch/csrc/nn/THNN.cpp", - "torch/csrc/tensor/python_tensor.cpp", - "torch/csrc/onnx/onnx.npb.cpp", - "torch/csrc/onnx/onnx.cpp", "torch/csrc/onnx/init.cpp", + "torch/csrc/serialization.cpp", + "torch/csrc/tensor/python_tensor.cpp", + "torch/csrc/utils.cpp", + "torch/csrc/utils/cuda_lazy_init.cpp", + "torch/csrc/utils/invalid_arguments.cpp", + "torch/csrc/utils/object_ptr.cpp", + "torch/csrc/utils/python_arg_parser.cpp", + "torch/csrc/utils/tensor_apply.cpp", + "torch/csrc/utils/tensor_conversion_dispatch.cpp", + "torch/csrc/utils/tensor_dtypes.cpp", + "torch/csrc/utils/tensor_flatten.cpp", + "torch/csrc/utils/tensor_layouts.cpp", + "torch/csrc/utils/tensor_list.cpp", + "torch/csrc/utils/tensor_new.cpp", + "torch/csrc/utils/tensor_numpy.cpp", + "torch/csrc/utils/tensor_types.cpp", + "torch/csrc/utils/tuple_parser.cpp", ] try: diff --git a/test/common.py b/test/common.py index 8ee3b2be855157..f3b60cf522f43c 100644 --- a/test/common.py +++ b/test/common.py @@ -28,7 +28,7 @@ import torch import torch.cuda from torch._utils_internal import get_writable_path -from torch._six import string_classes +from torch._six import string_classes, inf import torch.backends.cudnn import torch.backends.mkl @@ -56,25 +56,47 @@ def run_tests(argv=UNITTEST_ARGS): IS_WINDOWS = sys.platform == "win32" IS_PPC = platform.machine() == "ppc64le" -TEST_NUMPY = True -try: - import numpy -except ImportError: - TEST_NUMPY = False - -TEST_SCIPY = True -try: - import scipy -except ImportError: - TEST_SCIPY = False +def _check_module_exists(name): + r"""Returns if a top-level module with :attr:`name` exists *without** + importing it. This is generally safer than try-catch block around a + `import X`. It avoids third party libraries breaking assumptions of some of + our tests, e.g., setting multiprocessing start method when imported + (see librosa/#747, torchvision/#544). + """ + if not PY3: # Python 2 + import imp + try: + imp.find_module(name) + return True + except ImportError: + return False + elif PY34: # Python [3, 3.4) + import importlib + loader = importlib.find_loader(name) + return loader is not None + else: # Python >= 3.4 + import importlib + spec = importlib.util.find_spec(name) + return spec is not None + +TEST_NUMPY = _check_module_exists('numpy') +TEST_SCIPY = _check_module_exists('scipy') TEST_MKL = torch.backends.mkl.is_available() +# On Py2, importing librosa 0.6.1 triggers a TypeError (if using newest joblib) +# see librosa/librosa#729. 
+# TODO: allow Py2 when librosa 0.6.2 releases +TEST_LIBROSA = _check_module_exists('librosa') and PY3 + NO_MULTIPROCESSING_SPAWN = os.environ.get('NO_MULTIPROCESSING_SPAWN', '0') == '1' TEST_WITH_ASAN = os.getenv('PYTORCH_TEST_WITH_ASAN', '0') == '1' TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1' BUILT_WITH_ROCM = os.getenv('PYTORCH_BUILT_WITH_ROCM', '0') == '1' +if TEST_NUMPY: + import numpy + def skipIfNoLapack(fn): @wraps(fn) @@ -332,7 +354,7 @@ def assertTensorsEqual(a, b): elif isinstance(x, bool) and isinstance(y, bool): super(TestCase, self).assertEqual(x, y, message) elif isinstance(x, Number) and isinstance(y, Number): - if abs(x) == float('inf') or abs(y) == float('inf'): + if abs(x) == inf or abs(y) == inf: if allow_inf: super(TestCase, self).assertEqual(x, y, message) else: diff --git a/test/cpp/api/cursor.cpp b/test/cpp/api/cursor.cpp index 01a8cdb0c375a1..5c998661be2368 100644 --- a/test/cpp/api/cursor.cpp +++ b/test/cpp/api/cursor.cpp @@ -101,19 +101,36 @@ TEST_CASE("cursor/module") { SECTION("Map works") { std::vector vector(3); cursor.map(vector.begin(), [](Module& module) { return &module; }); + REQUIRE(vector[0] == &model[0]); + REQUIRE(vector[1] == &model[1]); + REQUIRE(vector[2] == &model[2]); std::list list; - cursor.map( - std::back_inserter(list), [](Module& module) { return &module; }); + cursor.map(std::inserter(list, list.end()), [](Module& module) { + return &module; + }); + REQUIRE(list.size() == 3); + auto iterator = list.begin(); + REQUIRE(*iterator++ == &model[0]); + REQUIRE(*iterator++ == &model[1]); + REQUIRE(*iterator++ == &model[2]); + REQUIRE(iterator == list.end()); } SECTION("Map_items works") { - std::map output; + std::map output; cursor.map_items( std::inserter(output, output.end()), [](const std::string& key, Module& module) { - return std::make_pair(key.c_str(), &module); + return std::make_pair(key, &module); }); + REQUIRE(output.size() == 3); + REQUIRE(output.count("0")); + REQUIRE(output.count("1")); + REQUIRE(output.count("2")); + REQUIRE(output["0"] == &model[0]); + REQUIRE(output["1"] == &model[1]); + REQUIRE(output["2"] == &model[2]); } SECTION("Count works for flat models") { @@ -280,29 +297,28 @@ TEST_CASE("cursor/parameter") { SECTION("Apply_items works") { size_t count = 0; - cursor.apply_items( - [&count, &model, &first, &second]( - const std::string& key, torch::Tensor& tensor) { - switch (count) { - case 0: { - REQUIRE(tensor.equal(first->tensor1)); - break; - } - case 1: { - REQUIRE(tensor.equal(first->tensor2)); - break; - } - case 2: { - REQUIRE(tensor.equal(second->tensor1)); - break; - } - case 3: { - REQUIRE(tensor.equal(second->tensor2)); - break; - } - } - count += 1; - }); + cursor.apply_items([&count, &model, &first, &second]( + const std::string& key, torch::Tensor& tensor) { + switch (count) { + case 0: { + REQUIRE(tensor.equal(first->tensor1)); + break; + } + case 1: { + REQUIRE(tensor.equal(first->tensor2)); + break; + } + case 2: { + REQUIRE(tensor.equal(second->tensor1)); + break; + } + case 3: { + REQUIRE(tensor.equal(second->tensor2)); + break; + } + } + count += 1; + }); REQUIRE(count == 4); } diff --git a/test/cpp/api/main.cpp b/test/cpp/api/main.cpp index 9dc554419809b9..4b1aaba64b2ef1 100644 --- a/test/cpp/api/main.cpp +++ b/test/cpp/api/main.cpp @@ -16,10 +16,16 @@ int main(int argc, char* argv[]) { return return_code; } + // ~ disables tags. if (!torch::cuda::is_available()) { - std::cerr << "CUDA not available. Disabling CUDA tests" << std::endl; - // ~ disables the [cuda] tag. 
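Aside: the test/common.py hunk above starts probing optional modules without importing them, and the test/cpp/api/main.cpp hunk here disables the [cuda] and [multi-cuda] Catch tags when the hardware is missing. The Python test suite expresses the same gating with unittest skips; the Python 3 sketch below is illustrative only (TEST_MULTIGPU mirrors the common_cuda.py convention, the helper and test class are not from this patch).

    import importlib.util
    import unittest

    import torch


    def module_exists(name):
        # True if `name` is importable; find_spec() consults the import
        # machinery without executing the module, so merely probing an
        # optional dependency cannot change global state.
        return importlib.util.find_spec(name) is not None


    TEST_NUMPY = module_exists("numpy")
    TEST_CUDA = torch.cuda.is_available()
    TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2


    class ExampleGatedTests(unittest.TestCase):
        @unittest.skipIf(not TEST_NUMPY, "numpy not found")
        def test_numpy_roundtrip(self):
            import numpy as np  # safe: the spec was found above
            t = torch.arange(6).reshape(2, 3)
            self.assertEqual(t.numpy().tolist(), np.arange(6).reshape(2, 3).tolist())

        @unittest.skipIf(not TEST_CUDA, "CUDA not available")
        def test_single_gpu(self):
            self.assertTrue(torch.ones(3, device="cuda").is_cuda)

        @unittest.skipIf(not TEST_MULTIGPU, "fewer than two CUDA devices")
        def test_second_gpu(self):
            self.assertEqual(torch.ones(3, device="cuda:1").device.index, 1)


    if __name__ == "__main__":
        unittest.main()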
+ std::cerr << "CUDA not available. Disabling [cuda] and [multi-cuda] tests" + << std::endl; session.configData().testsOrTags.emplace_back("~[cuda]"); + session.configData().testsOrTags.emplace_back("~[multi-cuda]"); + } else if (torch::cuda::device_count() < 2) { + std::cerr << "Only one CUDA device detected. Disabling [multi-cuda] tests" + << std::endl; + session.configData().testsOrTags.emplace_back("~[multi-cuda]"); } return session.run(); diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index c8e7bdc605660d..66b11c126df14f 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -120,7 +120,7 @@ TEST_CASE("module/as") { REQUIRE(unit.as() == &unit); } -TEST_CASE("module/conversions", "[cuda]") { +TEST_CASE("module/conversions", "[multi-cuda]") { torch::manual_seed(0); Linear module(128, 64); SECTION("starts as float on CPU") { @@ -350,3 +350,31 @@ TEST_CASE("module/buffers") { REQUIRE(buffers.contains("c")); } } + +TEST_CASE("module/default-constructor") { + struct AImpl : torch::nn::Module { + AImpl() : x_(123) {} + AImpl(int x) : x_(x) {} + int x_; + }; + TORCH_MODULE(A); + + { + A a; + REQUIRE(a); + REQUIRE(!a.is_empty()); + REQUIRE(a->x_ == 123); + } + { + A a(5); + REQUIRE(a); + REQUIRE(!a.is_empty()); + REQUIRE(a->x_ == 5); + } + { + A a = nullptr; + REQUIRE(!a); + REQUIRE(a.is_empty()); + REQUIRE_THROWS_WITH(a->x_, StartsWith("Accessing empty ModuleHolder")); + } +} diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index 02a9ca14a36705..186159c8e98edf 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -135,6 +135,30 @@ void check_exact_values( } } +TEST_CASE("Optim/BasicInterface") { + struct MyOptimizer : Optimizer { + using Optimizer::Optimizer; + void step() override {} + }; + std::vector parameters = { + torch::ones({2, 3}), torch::zeros({2, 3}), torch::rand({2, 3})}; + { + MyOptimizer optimizer(parameters); + REQUIRE(optimizer.size() == parameters.size()); + } + { + MyOptimizer optimizer; + REQUIRE(optimizer.size() == 0); + optimizer.add_parameters(parameters); + REQUIRE(optimizer.size() == parameters.size()); + } + { + Linear linear(3, 4); + MyOptimizer optimizer(linear->parameters()); + REQUIRE(optimizer.size() == linear->parameters().size()); + } +} + TEST_CASE("Optim/XORConvergence/SGD") { REQUIRE(test_optimizer_xor( SGDOptions(0.1).momentum(0.9).nesterov(true).weight_decay(1e-6))); diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp new file mode 100644 index 00000000000000..2d8f413c053006 --- /dev/null +++ b/test/cpp/api/parallel.cpp @@ -0,0 +1,230 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using Catch::StartsWith; + +using namespace torch::autograd; +using namespace torch::nn; + +TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { + Scatter scatter( + {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); + + auto input = torch::ones(10, torch::requires_grad(true)); + auto output = scatter.apply({input}); + + REQUIRE(output.size() == 2); + REQUIRE(output[0].size(0) == 5); + REQUIRE(output[1].size(0) == 5); + + REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)}) + .allclose(input)); + + auto sum = output[0].to({torch::kCUDA, 1}) + output[1]; + sum.backward(); + + REQUIRE(input.grad().defined()); + REQUIRE(input.grad().device().is_cpu()); + REQUIRE(input.grad().sum().toCInt() == 10); +} + +TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") { + Gather 
gather(torch::Device(torch::kCUDA, 1)); + + auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0})); + auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1})); + + auto outputs = gather.apply({a, b}); + REQUIRE(outputs.size() == 1); + auto& output = outputs.front(); + + REQUIRE(output.size(0) == 10); + REQUIRE(output.device() == torch::Device(torch::kCUDA, 1)); + + auto chunks = output.chunk(2); + REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a)); + REQUIRE(chunks[1].allclose(b)); + + output.backward(); + + REQUIRE(a.grad().defined()); + REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0)); + REQUIRE(a.grad().sum().toCInt() == 5); + + REQUIRE(b.grad().defined()); + REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1)); + REQUIRE(b.grad().sum().toCInt() == 5); +} + +TEST_CASE("Parallel/Replicate", "[multi-cuda]") { + Linear linear(3, 4); + auto replicas = parallel::replicate( + linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); + REQUIRE(replicas.size() == 2); + + auto original_parameters = linear->parameters(); + + auto replica1_parameters = replicas[0]->parameters(); + for (auto& parameter : replica1_parameters) { + REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0)); + } + replicas[0]->to(torch::kCPU); + REQUIRE(replica1_parameters.size() == original_parameters.size()); + for (size_t i = 0; i < original_parameters.size(); ++i) { + REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i])); + REQUIRE( + replica1_parameters[i]->data().data() != + original_parameters[i]->data().data()); + } + + auto replica2_parameters = replicas[1]->parameters(); + for (auto& parameter : replica2_parameters) { + REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1)); + } + replicas[1]->to(torch::kCPU); + REQUIRE(replica2_parameters.size() == original_parameters.size()); + for (size_t i = 0; i < original_parameters.size(); ++i) { + REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i])); + REQUIRE( + replica2_parameters[i]->data().data() != + original_parameters[i]->data().data()); + } +} + +TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { + Linear a(3, 4); + + Linear b(std::static_pointer_cast(a->clone())); + b->to({torch::kCUDA, 0}); + + Linear c(std::static_pointer_cast(a->clone())); + c->to({torch::kCUDA, 1}); + + std::vector modules = {a, b, c}; + std::vector inputs = { + torch::ones({2, 3}), + torch::ones({2, 3}, torch::device({torch::kCUDA, 0})), + torch::ones({2, 3}, torch::device({torch::kCUDA, 1}))}; + + auto outputs = parallel::parallel_apply(modules, inputs); + + REQUIRE(outputs.size() == 3); + REQUIRE(outputs[0].device().is_cpu()); + + REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0])); + + REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1)); + REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0])); +} + +TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { + struct M : torch::nn::Module { + torch::Tensor forward(torch::Tensor input) { + return torch::ones({5}, torch::dtype(torch::kInt32)); + } + }; + + std::vector> modules = { + std::make_shared(), std::make_shared(), std::make_shared()}; + std::vector inputs = { + torch::empty({}), torch::empty({}), torch::empty({})}; + std::vector devices = { + {torch::kCUDA, 1}, {torch::kCUDA, 0}, {torch::kCPU}}; + + auto outputs = parallel::parallel_apply(modules, inputs, devices); + + REQUIRE(outputs.size() == 3); + 
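The Scatter/Gather, replicate, and parallel_apply tests above exercise the C++ counterparts of the long-standing Python data-parallel primitives. A rough Python-side sketch of the same scatter/gather round trip, guarded so it is a no-op unless at least two CUDA devices are visible (assumed here):

    import torch
    from torch.nn.parallel import gather, scatter

    if torch.cuda.device_count() >= 2:
        x = torch.ones(10, requires_grad=True)      # CPU leaf tensor
        chunks = scatter(x, target_gpus=[0, 1])     # two halves, one per GPU
        assert [c.size(0) for c in chunks] == [5, 5]
        y = gather(chunks, target_device=1)         # reassembled on cuda:1
        assert y.device == torch.device("cuda", 1)
        y.sum().backward()                          # autograd flows back through gather/scatter
        assert x.grad.device.type == "cpu"          # gradient lands on the CPU leaf
        assert x.grad.sum().item() == 10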
REQUIRE(outputs[0].device().is_cuda()); + REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1)); + + REQUIRE(outputs[1].device().is_cuda()); + REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + + REQUIRE(outputs[2].device().is_cpu()); +} + +TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { + struct M : torch::nn::Cloneable { + void reset() override {} + torch::Tensor forward(torch::Tensor input) { + throw std::runtime_error("Badness!"); + } + }; + + auto m = std::make_shared(); + auto input = torch::ones({10, 3}); + REQUIRE_THROWS_WITH( + parallel::data_parallel(m, input), StartsWith("Badness!")); +} + +TEST_CASE( + "Parallel/DataParallelPlacesTheOutputOnTheRequestedDevice", + "[multi-cuda]") { + struct M : torch::nn::Cloneable { + void reset() override {} + torch::Tensor forward(torch::Tensor input) { + // Intermediate tensors should be on the replica's current device. + intermediate_tensor = torch::rand(5); + // The returned tensor should be on the output device. + return torch::ones(3); + } + torch::Tensor intermediate_tensor; + }; + auto m = std::make_shared(); + auto input = torch::ones({10, 3}); + { + auto output = parallel::data_parallel( + m, + input, + /*devices=*/at::nullopt, + /*output_device=*/torch::Device(torch::kCUDA, 1)); + REQUIRE(output.defined()); + REQUIRE(output.device().is_cuda()); + REQUIRE(output.device().index() == 1); + } + { + // Verify for the single-device case (where we don't scatter/gather). + auto output = parallel::data_parallel( + m, + input, + /*devices=*/std::vector{torch::Device(torch::kCUDA, 0)}, + /*output_device=*/torch::Device(torch::kCUDA, 1)); + REQUIRE(m->intermediate_tensor.defined()); + REQUIRE(m->intermediate_tensor.device().is_cuda()); + REQUIRE(m->intermediate_tensor.device().index() == 0); + REQUIRE(output.defined()); + REQUIRE(output.device().is_cuda()); + REQUIRE(output.device().index() == 1); + } +} + +TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") { + struct M : torch::nn::Cloneable { + void reset() override {} + torch::Tensor forward(torch::Tensor input) { + return torch::tensor(torch::DefaultTensorOptions::get().device().index()); + } + }; + + auto m = std::make_shared(); + auto input = torch::ones({10, 3}); + auto output = parallel::data_parallel(m, input); + + const auto device_count = torch::cuda::device_count(); + REQUIRE(output.numel() == device_count); + for (size_t i = 0; i < device_count; ++i) { + REQUIRE(output[i].toCInt() == i); + } +} diff --git a/test/cpp/api/tensor_cuda.cpp b/test/cpp/api/tensor_cuda.cpp index 5a92bd18ec90f1..82d874e74b11b0 100644 --- a/test/cpp/api/tensor_cuda.cpp +++ b/test/cpp/api/tensor_cuda.cpp @@ -4,7 +4,7 @@ #include -TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[cuda]") { +TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[multi-cuda]") { auto tensor = at::tensor({1, 2, 3}, at::device({at::kCUDA, 1})); REQUIRE(tensor.device().type() == at::Device::Type::CUDA); REQUIRE(tensor.device().index() == 1); diff --git a/test/cpp/api/tensor_options_cuda.cpp b/test/cpp/api/tensor_options_cuda.cpp index 596e3b77610365..f5b0635c834d9a 100644 --- a/test/cpp/api/tensor_options_cuda.cpp +++ b/test/cpp/api/tensor_options_cuda.cpp @@ -41,7 +41,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") { REQUIRE_OPTIONS(kCUDA, 5, kFloat, kSparse); } -TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[cuda]") { +TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") { auto options = 
TensorOptions(empty(5, device(kCUDA).dtype(kDouble))); REQUIRE_OPTIONS(kCUDA, 0, kDouble, kStrided); @@ -66,7 +66,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[cuda]") { } } -TEST_CASE("OptionsGuardCUDA", "[cuda]") { +TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") { Tensor tensor; { OptionsGuard guard(device(kCUDA)); @@ -87,7 +87,7 @@ TEST_CASE("OptionsGuardCUDA", "[cuda]") { REQUIRE_TENSOR_OPTIONS(kCUDA, 0, kInt, kStrided); } -TEST_CASE("DeviceGuardOptionsGuardInteraction", "[cuda]") { +TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") { Tensor tensor; { // Check that OptionsGuard respects any active device before construction. @@ -111,3 +111,18 @@ TEST_CASE("DeviceGuardOptionsGuardInteraction", "[cuda]") { } } } + +TEST_CASE("DeviceGuardIsMovable", "[cuda]") { + DeviceGuard first(1); + REQUIRE(first.original_index() == 0); + REQUIRE(first.last_index() == 1); + DeviceGuard second(std::move(first)); + REQUIRE(second.original_index() == 0); + REQUIRE(second.last_index() == 1); + REQUIRE(first.original_index() == -1); + DeviceGuard third; + third = std::move(second); + REQUIRE(third.original_index() == 0); + REQUIRE(third.last_index() == 1); + REQUIRE(second.original_index() == -1); +} diff --git a/test/expect/TestScript.test_call_python_fn_from_traced_module.expect b/test/expect/TestScript.test_call_python_fn_from_traced_module.expect index 2a87a361a9e3fa..4c9e2e2146aaf2 100644 --- a/test/expect/TestScript.test_call_python_fn_from_traced_module.expect +++ b/test/expect/TestScript.test_call_python_fn_from_traced_module.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4) %1 : Double(4, 3)) { - %2 : Double(3, 4) = aten::neg(%0) - %4 : Double(3, 3) = aten::mm(%2, %1) + %2 : Double(3, 4) = aten::neg(%0), scope: TracedModule + %4 : Double(3, 3) = aten::mm(%2, %1), scope: TracedModule return (%4); } diff --git a/test/expect/TestScript.test_call_python_mod_from_traced_module.expect b/test/expect/TestScript.test_call_python_mod_from_traced_module.expect index 925bbf19ea5058..d39acaf5257d3d 100644 --- a/test/expect/TestScript.test_call_python_mod_from_traced_module.expect +++ b/test/expect/TestScript.test_call_python_mod_from_traced_module.expect @@ -1,8 +1,8 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5) %2 : Double(5, 7)) { - %4 : Double(3, 5) = aten::mm(%0, %1) - %6 : Double(3, 7) = aten::mm(%4, %2) - %7 : Double(3, 7) = aten::add[other={1}, alpha={1}](%6) + %4 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %6 : Double(3, 7) = aten::mm(%4, %2), scope: TracedModule/PythonModule[mod] + %7 : Double(3, 7) = aten::add[other={1}, alpha={1}](%6), scope: TracedModule return (%7); } diff --git a/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect b/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect index 4de15a540cc32e..ea847d630c8ba5 100644 --- a/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4)) { - %1 : Double(4, 3) = prim::Constant[value=]() - %3 : Double(3, 3) = aten::mm(%0, %1) + %1 : Double(4, 3) = prim::Constant[value=](), scope: PythonMod + %3 : Double(3, 3) = aten::mm(%0, %1), scope: PythonMod %4 : Double(3, 3) = aten::add[other={1}, alpha={1}](%3) return (%4); } diff --git a/test/expect/TestScript.test_call_script_fn_from_traced_module.expect b/test/expect/TestScript.test_call_script_fn_from_traced_module.expect index adaab3880dc46d..6bf57b856cac8e 100644 --- 
a/test/expect/TestScript.test_call_script_fn_from_traced_module.expect +++ b/test/expect/TestScript.test_call_script_fn_from_traced_module.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5)) { - %3 : Double(3, 5) = aten::mm(%0, %1) - %5 : Double(3, 5) = aten::neg(%3) + %3 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %5 : Double(3, 5) = aten::neg(%3), scope: TracedModule/ScriptModule return (%5); } diff --git a/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect b/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect index cffec80d884616..dc8b4945df4773 100644 --- a/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect @@ -1,5 +1,5 @@ graph(%0 : Double(3, 4)) { - %2 : Double(3, 4) = aten::neg(%0) + %2 : Double(3, 4) = aten::neg(%0), scope: ScriptModule %3 : Double(3, 4) = aten::add[other={1}, alpha={1}](%2) return (%3); } diff --git a/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect b/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect index d446882fbaa956..fc7039bd971f23 100644 --- a/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4)) { - %1 : Double(4, 3) = prim::Constant[value=]() - %4 : Double(3, 3) = aten::mm(%0, %1) + %1 : Double(4, 3) = prim::Constant[value=](), scope: ScriptMod + %4 : Double(3, 3) = aten::mm(%0, %1), scope: ScriptMod %5 : Double(3, 3) = aten::add[other={1}, alpha={1}](%4) return (%5); } diff --git a/test/expect/TestScript.test_call_script_module_from_traced_module.expect b/test/expect/TestScript.test_call_script_module_from_traced_module.expect index c249ddc6b8c171..21b14a2a62f8cf 100644 --- a/test/expect/TestScript.test_call_script_module_from_traced_module.expect +++ b/test/expect/TestScript.test_call_script_module_from_traced_module.expect @@ -1,8 +1,8 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5) %2 : Double(5, 7)) { - %4 : Double(3, 5) = aten::mm(%0, %1) - %7 : Double(3, 7) = aten::mm(%4, %2) - %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7) + %4 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %7 : Double(3, 7) = aten::mm(%4, %2), scope: TracedModule/ScriptMod[mod] + %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7), scope: TracedModule return (%8); } diff --git a/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect b/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect index 4e25a8581f0706..f45c3f15a9caed 100644 --- a/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect +++ b/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5)) { - %3 : Double(3, 5) = aten::mm(%0, %1) - %5 : Double(3, 4) = aten::neg(%3) + %3 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %5 : Double(3, 4) = aten::neg(%3), scope: TracedModule/traced_fn return (%5); } diff --git a/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect b/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect index cffec80d884616..ed737f4b6580b4 100644 --- a/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect @@ -1,5 +1,5 @@ graph(%0 : Double(3, 4)) { - %2 : Double(3, 4) = aten::neg(%0) + %2 : Double(3, 4) = aten::neg(%0), scope: traced_fn1 %3 : Double(3, 4) = 
aten::add[other={1}, alpha={1}](%2) return (%3); } diff --git a/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect index d446882fbaa956..3fac45fc2dfdab 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4)) { - %1 : Double(4, 3) = prim::Constant[value=]() - %4 : Double(3, 3) = aten::mm(%0, %1) + %1 : Double(4, 3) = prim::Constant[value=](), scope: TracedModule[TracedModule] + %4 : Double(3, 3) = aten::mm(%0, %1), scope: TracedModule[TracedModule] %5 : Double(3, 3) = aten::add[other={1}, alpha={1}](%4) return (%5); } diff --git a/test/expect/TestScript.test_call_traced_module_from_traced_module.expect b/test/expect/TestScript.test_call_traced_module_from_traced_module.expect index c249ddc6b8c171..471f9f1c2ec3fe 100644 --- a/test/expect/TestScript.test_call_traced_module_from_traced_module.expect +++ b/test/expect/TestScript.test_call_traced_module_from_traced_module.expect @@ -1,8 +1,8 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5) %2 : Double(5, 7)) { - %4 : Double(3, 5) = aten::mm(%0, %1) - %7 : Double(3, 7) = aten::mm(%4, %2) - %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7) + %4 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %7 : Double(3, 7) = aten::mm(%4, %2), scope: TracedModule/TracedModule[TracedModule1][mod] + %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7), scope: TracedModule return (%8); } diff --git a/test/onnx/model_defs/squeezenet.py b/test/onnx/model_defs/squeezenet.py index 3db99b338bca97..e4ace18194ab71 100644 --- a/test/onnx/model_defs/squeezenet.py +++ b/test/onnx/model_defs/squeezenet.py @@ -67,7 +67,7 @@ def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): Fire(384, 64, 256, 256), Fire(512, 64, 256, 256), ) - # Final convolution is initialized differently form the rest + # Final convolution is initialized differently from the rest final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) self.classifier = nn.Sequential( nn.Dropout(p=0.5), diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index cbde1fa75c3610..dd601881131b96 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -432,7 +432,7 @@ def symb(g, x, y): return g.op('Sum', x, y[0], y[1]), ( g.op('Neg', x), g.op('Neg', y[0])) - @torch.onnx.symbolic_override_first_arg_based(symb) + @torch.onnx.symbolic_override(symb) def foo(x, y): return x + y[0] + y[1], (-x, -y[0]) diff --git a/test/test_autograd.py b/test/test_autograd.py index cb1c4e3def7e2b..e09b3366029427 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -10,6 +10,7 @@ from itertools import product from operator import mul, itemgetter from functools import reduce, wraps +from torch._six import inf, nan from torch.autograd.gradcheck import gradgradcheck, gradcheck from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile @@ -1524,12 +1525,12 @@ def _test_pyscalar_conversions(self, t, integral_conv): pyscalar = -12345.1 f[0] = pyscalar self.assertEqual(float(f), pyscalar) - f[0] = float('nan') + f[0] = nan self.assertTrue(math.isnan(float(f))) - f[0] = float('inf') - self.assertEqual(float(f), float('inf'), allow_inf=True) - f[0] = float('-inf') - self.assertEqual(float(f), float('-inf'), allow_inf=True) + f[0] = inf + self.assertEqual(float(f), inf, allow_inf=True) + f[0] = -inf + 
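The _test_pyscalar_conversions assertions around here boil down to ordinary Python float/int conversion rules; a small self-contained sketch (written against .item() rather than the 0-dim tensor element the test converts directly):

    import math
    import torch

    f = torch.zeros(1, dtype=torch.double)

    f[0] = math.inf
    assert float(f) == math.inf          # float() of a one-element tensor is fine

    f[0] = math.nan
    assert math.isnan(float(f))

    try:
        int(f[0].item())                 # NaN -> ValueError
    except ValueError:
        pass

    f[0] = math.inf
    try:
        int(f[0].item())                 # +/-inf -> OverflowError
    except OverflowError:
        pass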
self.assertEqual(float(f), -inf, allow_inf=True) # integral -> floating point # check we can convert something that loses precision @@ -1539,11 +1540,11 @@ def _test_pyscalar_conversions(self, t, integral_conv): self.assertEqual(float(l), float(pyscalar)) # floating point -> integral - f[0] = float('nan') + f[0] = nan self.assertRaises(ValueError, lambda: integral_conv(f[0])) - f[0] = float('inf') + f[0] = inf self.assertRaises(OverflowError, lambda: integral_conv(f[0])) - f[0] = float('-inf') + f[0] = -inf self.assertRaises(OverflowError, lambda: integral_conv(f[0])) f[0] = sys.float_info.max self.assertEqual(integral_conv(f), sys.float_info.max) @@ -1558,9 +1559,9 @@ def test_nonzero(tensor, value, expected): test_nonzero(l, -2, True) test_nonzero(f, 0.0, False) test_nonzero(f, sys.float_info.min, True) - test_nonzero(f, float('nan'), bool(float('nan'))) - test_nonzero(f, float('inf'), bool(float('inf'))) - test_nonzero(f, float('-inf'), bool(float('-inf'))) + test_nonzero(f, nan, bool(nan)) + test_nonzero(f, inf, bool(inf)) + test_nonzero(f, -inf, bool(-inf)) def test_pyscalar_conversions(self): self._test_pyscalar_conversions(lambda x: x, lambda x: int(x)) @@ -2106,13 +2107,22 @@ def test_dir(self): def test_as_strided(self): - def test(x, repro_fn, *args): + def test(x, prepro_fn, size, strides, offset=None): + x = x.to(torch.double).detach().requires_grad_() + + # Check that forward will **not** resize storage because it may + # cause NaN in output and fail numerical Jacobian check consequently + with torch.no_grad(): + y = prepro_fn(x) if prepro_fn is not None else x + max_offset = sum((si - 1) * st for si, st in zip(size, strides)) + max_offset += offset if offset is not None else y.storage_offset() + assert max_offset < len(y.storage()), "test case resizes storage" + def closure(x): - if repro_fn is not None: - x = repro_fn(x) - return x.as_strided(*args) + if prepro_fn is not None: + x = prepro_fn(x) + return x.as_strided(size, strides, offset) - x = x.to(torch.double).detach().requires_grad_() gradcheck(closure, [x]) gradgradcheck(closure, [x]) @@ -2120,7 +2130,7 @@ def closure(x): test(torch.arange(0, 25), lambda x: x.view(5, 5), [3, 3], [6, 2], 2) # test crazy stride at dim with size 1 case - test(torch.randn(10), None, [1, 2, 1, 5], [0, 5, 100, 1], 2) + test(torch.randn(12), None, [1, 2, 1, 5], [0, 5, 100, 1], 2) # test expand case test(torch.randn(5), None, [3, 3, 3], [0, 1, 0], 2) @@ -2634,6 +2644,9 @@ class dont_convert(tuple): ('reshape', (S,), (S,), '1d'), ('reshape', (), (dont_convert(()),), 'scalar_to_scalar'), ('reshape', (), (1,), 'scalar_to_1d'), + ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)), + ('reshape_as', (), (non_differentiable(torch.tensor(42.)),), 'scalar'), + ('reshape_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'), ('flip', (S, S, S), ([0],), 'd0'), ('flip', (S, S, S), ([0, 1, 2],), 'd012'), ('flip', (S, S, S), ([0, 2],), 'd02'), @@ -2825,7 +2838,7 @@ class dont_convert(tuple): ('std', (S,), (0, True, True), 'keepdim_dim_1d', [0]), ('renorm', (S, S, S), (2, 1, 0.5), 'dim', [1]), ('renorm', (S, S, S), (1, 2, 3), 'norm_1'), - ('renorm', (S, S, S), (float('inf'), 2, 0.5), 'norm_inf'), + ('renorm', (S, S, S), (inf, 2, 0.5), 'norm_inf'), ('repeat', (S,), (2,), 'single_number'), ('repeat', (), (2, 3), 'scalar'), ('repeat', (2, 2), (3, 2)), @@ -2917,7 +2930,7 @@ class dont_convert(tuple): ('norm', (S, S), (0.5,), '0_5'), ('norm', (S, S), (1,), '1'), ('norm', (S, S), (3,), '3'), - ('norm', (S, S), (float('inf'),), 
'inf'), + ('norm', (S, S), (inf,), 'inf'), ('norm', (S, S), (-1,), 'neg_1'), ('norm', (S, S), (-0.5,), 'neg_0_5'), ('norm', (S, S), (-1.5,), 'neg_1_5'), diff --git a/test/test_cuda.py b/test/test_cuda.py index 6e6705b9b3eac6..9d56682f3cf066 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -12,6 +12,7 @@ import torch.cuda import torch.cuda.comm as comm from torch import multiprocessing as mp +from torch._six import inf, nan from test_torch import TestTorch from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \ @@ -89,11 +90,7 @@ def is_half(t): def number(floating, integer, t): - name = type(t).__name__ - if 'Double' in name or 'Float' in name or 'Half' in name: - return floating - else: - return integer + return floating if is_floating(t) else integer def cast_tensor(tensor, t): @@ -480,6 +477,7 @@ def tmp(t): 'add': 1e-2, 'acos': 1e-3, 'addbmm': 1e-1, + 'addcdiv': 1e-2, 'addcmul': 1e-2, 'addmm': 1e-1, 'addmv': 1e-2, @@ -501,6 +499,7 @@ def tmp(t): 'erfinv': 1e-3, 'exp': 1e-2, 'expm1': 1e-2, + 'fill': 1e-3, 'lerp': 1e-2, 'lgamma': 1e-2, 'log': 1e-2, @@ -595,7 +594,7 @@ def tmp(self): gpu_tensor = to_gpu(cpu_tensor) cpu_args = arg_constructor(t) gpu_args = [to_gpu(arg) for arg in cpu_args] - if t.__name__ == 'HalfTensor': + if is_half(t): cpu_tensor = cpu_tensor.float() cpu_args = [arg.float() if isinstance(arg, torch.Tensor) and is_half(arg) else arg for arg in cpu_args] cpu_result = getattr(cpu_tensor, fn)(*cpu_args) @@ -784,7 +783,7 @@ def advance(gen, end): if not end0: gen1_max_times = torch.LongTensor(1).random_(0, 3)[0] else: - gen1_max_times = float('inf') + gen1_max_times = inf t = 0 while t < gen1_max_times and not end1: end1 = advance(gen1, end1) @@ -903,7 +902,7 @@ def test_min_max_nan(self): (lambda x: x.max(0)[0], 'max_dim')] for f, name in tests: a = torch.arange(25.0).view(5, 5) - a[2, 2] = float('nan') + a[2, 2] = nan actual = f(a.cuda()).cpu() expected = f(a).cpu() self.assertEqual(torch.isnan(actual), torch.isnan(expected), 'nans for {}'.format(name)) @@ -1479,11 +1478,8 @@ def mute(): os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stderr.fileno()) def _spawn_method(self, method, arg): - try: - mp.set_start_method('spawn') - except RuntimeError: - pass - with mp.Pool(1, initializer=self.mute) as pool: + ctx = mp.get_context("spawn") + with ctx.Pool(1, initializer=self.mute) as pool: errors = pool.map(method, [arg]) for e in errors: if 'device-side assert triggered' not in str(e): @@ -1508,9 +1504,9 @@ def _test_multinomial_invalid_probs_cuda(probs): def test_multinomial_invalid_probs_cuda(self): test_method = TestCuda._test_multinomial_invalid_probs_cuda self._spawn_method(test_method, torch.Tensor([0, -1])) - self._spawn_method(test_method, torch.Tensor([0, float('inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + self._spawn_method(test_method, torch.Tensor([0, inf])) + self._spawn_method(test_method, torch.Tensor([0, -inf])) + self._spawn_method(test_method, torch.Tensor([0, nan])) def test_broadcast(self): TestTorch._test_broadcast(self, lambda t: t.cuda()) @@ -1691,7 +1687,6 @@ def test(use_double=False): cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, -100.99999994, -1931.99999994, 0.000000111, -0.000000111, 0, -1, -2, -931]) - nan = float('nan') expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan]) gpu_tensor = cpu_tensor.cuda() cpu_out = cpu_tensor.digamma() @@ -1912,7 +1907,7 @@ def 
generate_tests(): continue precision = custom_precision.get(name, TestCuda.precision) - if t == torch.HalfTensor: + if is_half(t): precision = custom_half_precision.get(name, precision) for inplace in (True, False): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index d9a03f3401158d..8b3136a57d8e6a 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -9,7 +9,7 @@ import traceback import unittest import subprocess -from torch import multiprocessing +from torch import multiprocessing as mp from torch.utils.data import Dataset, TensorDataset, DataLoader, ConcatDataset from torch.utils.data.dataset import random_split from torch.utils.data.dataloader import default_collate, ExceptionWrapper, MANAGER_STATUS_CHECK_INTERVAL @@ -24,12 +24,9 @@ # We need spawn start method for test_manager_unclean_exit, but # Python 2.7 doesn't allow it. if sys.version_info[0] == 3: - # Without the try-catch block, some tests will complain that - # context has already been set. - try: - multiprocessing.set_start_method('spawn') - except RuntimeError: - pass + # Get a multiprocessing context because some test / third party library will + # set start_method when imported, and setting again triggers RuntimeError. + mp = mp.get_context(method='spawn') JOIN_TIMEOUT = 17.0 if IS_WINDOWS else 6.5 @@ -144,11 +141,11 @@ def test_add_dataset(self): # Stores the first encountered exception in .exception. # Inspired by https://stackoverflow.com/a/33599967 -class ErrorTrackingProcess(multiprocessing.Process): +class ErrorTrackingProcess(mp.Process): def __init__(self, *args, **kwargs): super(ErrorTrackingProcess, self).__init__(*args, **kwargs) - self._pconn, self._cconn = multiprocessing.Pipe() + self._pconn, self._cconn = mp.Pipe() self._exception = None def run(self): @@ -235,8 +232,8 @@ class SynchronizedSeedDataset(Dataset): def __init__(self, size, num_workers): assert size >= num_workers - self.count = multiprocessing.Value('i', 0, lock=True) - self.barrier = multiprocessing.Semaphore(0) + self.count = mp.Value('i', 0, lock=True) + self.barrier = mp.Semaphore(0) self.num_workers = num_workers self.size = size @@ -537,12 +534,12 @@ def _is_process_alive(pid, pname): def test_manager_unclean_exit(self): '''there might be ConnectionResetError or leaked semaphore warning (due to dirty process exit), \ but they are all safe to ignore''' - worker_pids = multiprocessing.Array('i', [0] * 4) + worker_pids = mp.Array('i', [0] * 4) - manager_exit_event = multiprocessing.Event() - mp = multiprocessing.Process(target=TestDataLoader._manager_process, - args=(self.dataset, worker_pids, manager_exit_event)) - mp.start() + manager_exit_event = mp.Event() + p = mp.Process(target=TestDataLoader._manager_process, + args=(self.dataset, worker_pids, manager_exit_event)) + p.start() manager_exit_event.wait() diff --git a/test/test_distributions.py b/test/test_distributions.py index 2f97370f713d8f..f53271e1ea0277 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -30,6 +30,7 @@ from random import shuffle import torch +from torch._six import inf from common import TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN from common_cuda import TEST_CUDA from torch.autograd import grad, gradcheck @@ -782,7 +783,7 @@ def test_geometric(self): s = 0.3 self.assertEqual(Geometric(p).sample((8,)).size(), (8, 3)) self.assertEqual(Geometric(1).sample(), 0) - self.assertEqual(Geometric(1).log_prob(torch.tensor(1.)), -float('inf'), allow_inf=True) + 
self.assertEqual(Geometric(1).log_prob(torch.tensor(1.)), -inf, allow_inf=True) self.assertEqual(Geometric(1).log_prob(torch.tensor(0.)), 0) self.assertFalse(Geometric(p).sample().requires_grad) self.assertEqual(Geometric(r).sample((8,)).size(), (8,)) @@ -1162,8 +1163,8 @@ def test_uniform(self): uniform = Uniform(low_1d, high_1d) above_high = torch.tensor([4.0]) below_low = torch.tensor([-1.0]) - self.assertEqual(uniform.log_prob(above_high).item(), -float('inf'), allow_inf=True) - self.assertEqual(uniform.log_prob(below_low).item(), -float('inf'), allow_inf=True) + self.assertEqual(uniform.log_prob(above_high).item(), -inf, allow_inf=True) + self.assertEqual(uniform.log_prob(below_low).item(), -inf, allow_inf=True) # check cdf computation when value outside range self.assertEqual(uniform.cdf(below_low).item(), 0) @@ -1190,7 +1191,7 @@ def test_cauchy(self): loc_1d = torch.zeros(1, requires_grad=True) scale_1d = torch.ones(1, requires_grad=True) self.assertTrue(is_all_nan(Cauchy(loc_1d, scale_1d).mean)) - self.assertEqual(Cauchy(loc_1d, scale_1d).variance, float('inf'), allow_inf=True) + self.assertEqual(Cauchy(loc_1d, scale_1d).variance, inf, allow_inf=True) self.assertEqual(Cauchy(loc, scale).sample().size(), (5, 5)) self.assertEqual(Cauchy(loc, scale).sample((7,)).size(), (7, 5, 5)) self.assertEqual(Cauchy(loc_1d, scale_1d).sample().size(), (1,)) @@ -1216,7 +1217,7 @@ def test_halfcauchy(self): scale = torch.ones(5, 5, requires_grad=True) scale_1d = torch.ones(1, requires_grad=True) self.assertTrue(is_all_nan(HalfCauchy(scale_1d).mean)) - self.assertEqual(HalfCauchy(scale_1d).variance, float('inf'), allow_inf=True) + self.assertEqual(HalfCauchy(scale_1d).variance, inf, allow_inf=True) self.assertEqual(HalfCauchy(scale).sample().size(), (5, 5)) self.assertEqual(HalfCauchy(scale).sample((7,)).size(), (7, 5, 5)) self.assertEqual(HalfCauchy(scale_1d).sample().size(), (1,)) @@ -1714,8 +1715,8 @@ def test_pareto(self): alpha = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) alpha_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) - self.assertEqual(Pareto(scale_1d, 0.5).mean, float('inf'), allow_inf=True) - self.assertEqual(Pareto(scale_1d, 0.5).variance, float('inf'), allow_inf=True) + self.assertEqual(Pareto(scale_1d, 0.5).mean, inf, allow_inf=True) + self.assertEqual(Pareto(scale_1d, 0.5).variance, inf, allow_inf=True) self.assertEqual(Pareto(scale, alpha).sample().size(), (2, 3)) self.assertEqual(Pareto(scale, alpha).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Pareto(scale_1d, alpha_1d).sample((1,)).size(), (1, 1)) @@ -1832,7 +1833,7 @@ def test_studentT(self): df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) self.assertTrue(is_all_nan(StudentT(1).mean)) self.assertTrue(is_all_nan(StudentT(1).variance)) - self.assertEqual(StudentT(2).variance, float('inf'), allow_inf=True) + self.assertEqual(StudentT(2).variance, inf, allow_inf=True) self.assertEqual(StudentT(df).sample().size(), (2, 3)) self.assertEqual(StudentT(df).sample((5,)).size(), (5, 2, 3)) self.assertEqual(StudentT(df_1d).sample((1,)).size(), (1, 1)) @@ -2962,7 +2963,7 @@ def test_kl_exponential_family(self): def test_kl_infinite(self): for p, q in self.infinite_examples: - self.assertTrue((kl_divergence(p, q) == float('inf')).all(), + self.assertTrue((kl_divergence(p, q) == inf).all(), 'Incorrect KL({}, {})'.format(type(p).__name__, type(q).__name__)) def test_kl_edgecases(self): @@ -2996,7 +2997,7 @@ def 
test_entropy_monte_carlo(self): continue x = dist.sample(sample_shape=(60000,)) expected = -dist.log_prob(x).mean(0) - ignore = (expected == float('inf')) + ignore = (expected == inf) expected[ignore] = actual[ignore] self.assertEqual(actual, expected, prec=0.2, message='\n'.join([ '{} example {}/{}, incorrect .entropy().'.format(Dist.__name__, i + 1, len(params)), @@ -3157,12 +3158,12 @@ def test_categorical_log_prob(self): def test_categorical_log_prob_with_logits(self): for dtype in ([torch.float, torch.double]): - p = torch.tensor([-float('inf'), 0], dtype=dtype, requires_grad=True) + p = torch.tensor([-inf, 0], dtype=dtype, requires_grad=True) categorical = OneHotCategorical(logits=p) log_pdf_prob_1 = categorical.log_prob(torch.tensor([0, 1], dtype=dtype)) self.assertEqual(log_pdf_prob_1.item(), 0) log_pdf_prob_0 = categorical.log_prob(torch.tensor([1, 0], dtype=dtype)) - self.assertEqual(log_pdf_prob_0.item(), -float('inf'), allow_inf=True) + self.assertEqual(log_pdf_prob_0.item(), -inf, allow_inf=True) def test_multinomial_log_prob(self): for dtype in ([torch.float, torch.double]): @@ -3174,12 +3175,12 @@ def test_multinomial_log_prob(self): def test_multinomial_log_prob_with_logits(self): for dtype in ([torch.float, torch.double]): - p = torch.tensor([-float('inf'), 0], dtype=dtype, requires_grad=True) + p = torch.tensor([-inf, 0], dtype=dtype, requires_grad=True) multinomial = Multinomial(10, logits=p) log_pdf_prob_1 = multinomial.log_prob(torch.tensor([0, 10], dtype=dtype)) self.assertEqual(log_pdf_prob_1.item(), 0) log_pdf_prob_0 = multinomial.log_prob(torch.tensor([10, 0], dtype=dtype)) - self.assertEqual(log_pdf_prob_0.item(), -float('inf'), allow_inf=True) + self.assertEqual(log_pdf_prob_0.item(), -inf, allow_inf=True) class TestLazyLogitsInitialization(TestCase): diff --git a/test/test_jit.py b/test/test_jit.py index 706f33fe2db1d5..a20436e167188f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -333,9 +333,9 @@ def test_scopes(self): def f(x, y): out = x + y - with torch.jit.scope('Foo', out): + with torch.jit.scope('Foo'): out = x * out - with torch.jit.scope('Bar', out): + with torch.jit.scope('Bar'): out = torch.tanh(out) out = torch.sigmoid(out) return out @@ -488,6 +488,16 @@ def test_relu(self): ge = self.checkTrace(self.fn_test_relu, (x, y)) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_small_constant(self): + def fn_test_small_constant(x, y): + return (1e-8 * x + 5e-9 * y) * 1e8 + x = torch.randn(4, 4, dtype=torch.float, device='cuda') + y = torch.randn(4, 4, dtype=torch.float, device='cuda') + + ge = self.checkTrace(fn_test_small_constant, (x, y)) + @staticmethod def fn_test_exp(x, y): return (x + .5 * y).exp() diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index e88a3578f2d9d2..da7ee1cdd545c7 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -249,8 +249,6 @@ def test_fd_sharing(self): self._test_sharing(repeat=TEST_REPEATS) @unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on macOS") - @unittest.skipIf(TEST_WITH_ASAN, - "test_fd_preserve_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5311") def test_fd_preserve_sharing(self): self._test_preserve_sharing(repeat=TEST_REPEATS) @@ -264,8 +262,6 @@ def test_fs_sharing(self): with fs_sharing(): self._test_sharing(repeat=TEST_REPEATS) - @unittest.skipIf(TEST_WITH_ASAN, - "test_fs_preserve_sharing is known buggy, see 
https://github.com/pytorch/pytorch/issues/5311") def test_fs_preserve_sharing(self): with fs_sharing(): self._test_preserve_sharing(repeat=TEST_REPEATS) diff --git a/test/test_nn.py b/test/test_nn.py index 6e4de3aba533cb..f318132d9b51ea 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -15,6 +15,7 @@ import os import torch +from torch._six import inf, nan import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as F @@ -733,6 +734,20 @@ def _test_dropout(self, cls, input): module.__repr__() str(module) + def _test_alpha_dropout(self, cls, input): + mean = input.mean() + std = input.std() + + for p in [0.2, 0.5, 0.8]: + module = cls(p) + input_var = torch.tensor(input, requires_grad=True) + output = module(input_var) + # output mean should be close to input mean + self.assertLess(abs(output.data.mean() - mean), 0.1) + # output std should be close to input std + self.assertLess(abs(output.data.std() - std), 0.1) + output.backward(input) + def test_parameters(self): def num_params(module): return len(list(module.parameters())) @@ -1451,7 +1466,7 @@ def test_clip_grad_norm(self): def compute_norm(norm_type): norm_type = float(norm_type) - if norm_type != float('inf'): + if norm_type != inf: total_norm = 0 for p in l.parameters(): total_norm += p.grad.data.abs().pow(norm_type).sum() @@ -1546,8 +1561,6 @@ def test_vector_to_parameters(self): # We don't want to make propagating NaN a hard requirement on ops, but for # these easy ones, we should make them do so. def _test_nonlinearity_propagate_nan(self, device): - nan = float('nan') - def test(nonlinearity, *args, **kwargs): x = torch.tensor([nan], device=device) fn = getattr(F, nonlinearity) @@ -2086,19 +2099,16 @@ def test_Dropout3d(self): def test_AlphaDropout(self): # generate random tensor with zero mean and unit std input = torch.randn(5000) + self._test_alpha_dropout(nn.AlphaDropout, input) - mean = input.mean() - std = input.std() - - for p in [0.2, 0.5, 0.8]: - module = nn.AlphaDropout(p) - input_var = torch.tensor(input, requires_grad=True) - output = module(input_var) - # output mean should be close to input mean - self.assertLess(abs(output.data.mean() - mean), 0.1) - # output std should be close to input std - self.assertLess(abs(output.data.std() - std), 0.1) - output.backward(input) + def test_FeatureAlphaDropout(self): + b = random.randint(1, 5) + w = random.randint(1, 5) + h = random.randint(1, 5) + d = random.randint(1, 2) + num_features = 1000 + input = torch.randn(num_features, b, d, w, h) + self._test_alpha_dropout(nn.FeatureAlphaDropout, input) def _test_InstanceNorm_general(self, cls, input, device="cpu", dtype=torch.float): # default case track_running_stats=False @@ -2536,7 +2546,7 @@ def _test_max_pool_nan(self, device, dtype=torch.float): for num_dim in [1, 2, 3]: fn_name = '{}max_pool{}d'.format(adaptive, num_dim) fn = getattr(F, fn_name) - x = torch.full([1, 1] + num_dim * [3], float('nan')) + x = torch.full([1, 1] + num_dim * [3], nan) res = fn(x, 1 if adaptive else 3) self.assertTrue(math.isnan(res.item())) diff --git a/test/test_optim.py b/test/test_optim.py index 67328919c32df6..35aa7b2bfb7a5b 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -3,6 +3,7 @@ import functools from copy import deepcopy import torch +from torch._six import inf import torch.optim as optim import torch.legacy.optim as old_optim import torch.nn.functional as F @@ -479,8 +480,8 @@ def test_lbfgs(self): @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN") def 
test_lbfgs_return_type(self): params = [torch.randn(10, 5), torch.randn(10)] - opt1 = optim.LBFGS(params, 0.01, tolerance_grad=float('inf')) - opt2 = optim.LBFGS(params, 0.01, tolerance_grad=-float('inf')) + opt1 = optim.LBFGS(params, 0.01, tolerance_grad=inf) + opt2 = optim.LBFGS(params, 0.01, tolerance_grad=-inf) def closure(): return torch.Tensor([10]) diff --git a/test/test_torch.py b/test/test_torch.py index 4a015829c389a5..bf8f9102529595 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -16,12 +16,13 @@ from torch._utils_internal import get_file_path, get_file_path_2 from torch.utils.dlpack import from_dlpack, to_dlpack from torch._utils import _rebuild_tensor +from torch._six import inf, nan from itertools import product, combinations from functools import reduce from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ - run_tests, download_file, skipIfNoLapack, suppress_warnings, IS_WINDOWS, \ - PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize + TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize from multiprocessing.reduction import ForkingPickler if TEST_NUMPY: @@ -30,6 +31,9 @@ if TEST_SCIPY: from scipy import signal +if TEST_LIBROSA: + import librosa + SIZE = 100 can_retrieve_source = True @@ -84,6 +88,60 @@ def __exit__(self, *args): class TestTorch(TestCase): + def _check_sum_dim(tensors, dim): + for tensor in tensors: + expected = tensor.numpy().sum(dim) + actual = tensor.sum(dim) + self.assertEqual(expected.shape, actual.shape) + if actual.dtype == torch.float: + self.assertTrue(np.allclose(expected, actual.numpy(), rtol=1e-03, atol=1e-05)) + else: + self.assertTrue(np.allclose(expected, actual.numpy())) + + def _make_tensors(self, shape, val_range=(-100, 100), use_floating=True, use_integral=True): + float_types = [torch.double, + torch.float] + int_types = [torch.int64, + torch.int32, + torch.int16] + + def make_contiguous(shape, dtype): + if dtype in float_types: + val = torch.randn(shape, dtype=dtype) + val = val * ((val_range[1] - val_range[0]) / (math.pi * 2.0)) + val = val + ((val_range[1] - val_range[0]) / 2.0) + val = torch.clamp(val, min=val_range[0], max=val_range[1]) + return val + result = torch.zeros(shape, dtype=dtype) + result.apply_(lambda x: random.randint(val_range[0], val_range[1])) + return result + + def make_non_contiguous(shape, dtype): + contig = make_contiguous(shape, dtype) + non_contig = torch.empty(shape + (2, 2), dtype=dtype)[..., 0] + non_contig = non_contig.select(-1, -1) + non_contig.copy_(contig) + self.assertFalse(non_contig.is_contiguous()) + return non_contig + + def make_contiguous_slice(size, dtype): + contig = make_contiguous((1, size), dtype) + non_contig = contig[:1, 1:size - 1] + self.assertTrue(non_contig.is_contiguous()) + return contig + + types = [] + if use_floating: + types += float_types + if use_integral: + types += int_types + tensors = {"cont": [], "noncont": [], "slice": []} + for dtype in types: + tensors["cont"].append(make_contiguous(shape, dtype)) + tensors["noncont"].append(make_non_contiguous(shape, dtype)) + tensors["slice"].append(make_contiguous_slice(sum(list(shape)), dtype)) + + return tensors def test_dot(self): types = { @@ -238,17 +296,17 @@ def test_allclose(self): self.assertTrue(torch.allclose(x, y, rtol=0.01, atol=0.0)) self.assertFalse(torch.allclose(x, y)) self.assertTrue(torch.allclose(torch.tensor([0.0]), torch.tensor([1e-8]))) - x = 
torch.tensor([2.0, 3.0, float('nan')]) - y = torch.tensor([2.01, 3.01, float('nan')]) + x = torch.tensor([2.0, 3.0, nan]) + y = torch.tensor([2.01, 3.01, nan]) self.assertFalse(torch.allclose(x, y, rtol=1e-2)) self.assertTrue(torch.allclose(x, y, rtol=1e-2, equal_nan=True)) self.assertFalse(torch.allclose(x, y, rtol=1e-3, equal_nan=True)) - inf = torch.tensor([float('inf')]) - self.assertTrue(torch.allclose(inf, inf)) - self.assertTrue(torch.allclose(-inf, -inf)) - self.assertFalse(torch.allclose(inf, -inf)) - self.assertFalse(torch.allclose(inf, torch.tensor([1e20]))) - self.assertFalse(torch.allclose(-inf, torch.tensor([-1e20]))) + inf_t = torch.tensor([inf]) + self.assertTrue(torch.allclose(inf_t, inf_t)) + self.assertTrue(torch.allclose(-inf_t, -inf_t)) + self.assertFalse(torch.allclose(inf_t, -inf_t)) + self.assertFalse(torch.allclose(inf_t, torch.tensor([1e20]))) + self.assertFalse(torch.allclose(-inf_t, torch.tensor([-1e20]))) def test_linear_algebra_scalar_raises(self): m = torch.randn(5, 5) @@ -356,13 +414,13 @@ def sinh(x): try: return math.sinh(x) except OverflowError: - return float('inf') if x > 0 else float('-inf') + return inf if x > 0 else -inf self._test_math(torch.sinh, sinh) def test_lgamma(self): def lgamma(x): if x <= 0 and x == int(x): - return float('inf') + return inf return math.lgamma(x) self._test_math(torch.lgamma, lgamma) @@ -389,14 +447,14 @@ def test_digamma(self): # scipy 1.1.0 changed when it returns +/-inf vs. NaN def torch_digamma_without_inf(inp): res = torch.digamma(inp) - res[(res == float('-inf')) | (res == float('inf'))] = float('nan') + res[(res == -inf) | (res == inf)] = nan return res def scipy_digamma_without_inf(inp): res = digamma(inp) if np.isscalar(res): - return res if np.isfinite(res) else float('nan') - res[np.isinf(res)] = float('nan') + return res if np.isfinite(res) else nan + res[np.isinf(res)] = nan return res self._test_math(torch_digamma_without_inf, scipy_digamma_without_inf, self._digamma_input()) @@ -410,7 +468,7 @@ def test_polygamma(self): self._digamma_input(test_poles=False)) def test_asin(self): - self._test_math(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else float('nan')) + self._test_math(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else nan) def test_cos(self): self._test_math_by_name('cos') @@ -422,11 +480,11 @@ def cosh(x): except OverflowError: # Return inf on overflow. 
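These _test_math cases all follow one pattern: evaluate the torch op and a scalar Python reference that encodes the NaN/inf conventions, then compare elementwise. A minimal sketch of that pattern for asin (the helper name is illustrative, not taken from test_torch.py):

    import math
    import torch

    def asin_ref(x):
        # Out-of-domain inputs map to NaN, matching the torch convention.
        return math.asin(x) if abs(x) <= 1 else math.nan

    t = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0], dtype=torch.double)
    expected = torch.tensor([asin_ref(v) for v in t.tolist()], dtype=torch.double)
    actual = torch.asin(t)
    # equal_nan=True lets the out-of-domain entries compare equal, mirroring
    # the NaN handling in the test harness.
    assert torch.allclose(actual, expected, equal_nan=True)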
# See http://en.cppreference.com/w/cpp/numeric/math/cosh - return float('inf') + return inf self._test_math(torch.cosh, cosh) def test_acos(self): - self._test_math(torch.acos, lambda x: math.acos(x) if abs(x) <= 1 else float('nan')) + self._test_math(torch.acos, lambda x: math.acos(x) if abs(x) <= 1 else nan) def test_tan(self): self._test_math_by_name('tan') @@ -440,36 +498,36 @@ def test_atan(self): def test_log(self): def log(x): if x == 0: - return float('-inf') + return -inf elif x < 0: - return float('nan') + return nan return math.log(x) self._test_math(torch.log, log) def test_log10(self): def log10(x): if x == 0: - return float('-inf') + return -inf elif x < 0: - return float('nan') + return nan return math.log10(x) self._test_math(torch.log10, log10) def test_log1p(self): def log1p(x): if x == -1: - return float('-inf') + return -inf elif x < -1: - return float('nan') + return nan return math.log1p(x) self._test_math(torch.log1p, log1p) def test_log2(self): def log2(x): if x == 0: - return float('-inf') + return -inf elif x < 0: - return float('nan') + return nan try: return math.log2(x) except AttributeError: @@ -477,7 +535,7 @@ def log2(x): self._test_math(torch.log2, log2) def test_sqrt(self): - self._test_math(torch.sqrt, lambda x: math.sqrt(x) if x >= 0 else float('nan')) + self._test_math(torch.sqrt, lambda x: math.sqrt(x) if x >= 0 else nan) def test_erf(self): self._test_math_by_name('erf') @@ -490,9 +548,9 @@ def checkType(tensor): inputValues = torch.randn(4, 4, out=tensor()).clamp(-2., 2.) self.assertEqual(tensor(inputValues).erf().erfinv(), tensor(inputValues)) # test inf - self.assertTrue(torch.equal(tensor([-1, 1]).erfinv(), tensor([float('-inf'), float('inf')]))) + self.assertTrue(torch.equal(tensor([-1, 1]).erfinv(), tensor([-inf, inf]))) # test nan - self.assertEqual(tensor([-2, 2]).erfinv(), tensor([float('nan'), float('nan')])) + self.assertEqual(tensor([-2, 2]).erfinv(), tensor([nan, nan])) checkType(torch.FloatTensor) checkType(torch.DoubleTensor) @@ -502,7 +560,7 @@ def exp(x): try: return math.exp(x) except OverflowError: - return float('inf') + return inf self._test_math(torch.exp, exp) def test_expm1(self): @@ -510,7 +568,7 @@ def expm1(x): try: return math.expm1(x) except OverflowError: - return float('inf') + return inf self._test_math(torch.expm1, expm1) def test_floor(self): @@ -522,9 +580,9 @@ def test_ceil(self): def test_rsqrt(self): def rsqrt(x): if x == 0: - return float('inf') + return inf elif x < 0: - return float('nan') + return nan return 1.0 / math.sqrt(x) self._test_math(torch.rsqrt, rsqrt) @@ -612,7 +670,7 @@ def _testSelection(self, torchfn, mathfn): # NaNs for index in (0, 4, 99): m1 = torch.randn(100) - m1[index] = float('nan') + m1[index] = nan res1val, res1ind = torch.max(m1, 0) self.assertTrue(math.isnan(res1val)) self.assertEqual(res1ind, index) @@ -630,14 +688,14 @@ def _test_norm(self, device): # full reduction x = torch.randn(5, device=device) xn = x.cpu().numpy() - for p in [0, 1, 2, 3, 4, float('inf')]: + for p in [0, 1, 2, 3, 4, inf]: res = x.norm(p).item() expected = np.linalg.norm(xn, p) self.assertEqual(res, expected, "full reduction failed for {}-norm".format(p)) # one dimension x = torch.randn(5, 5, device=device) xn = x.cpu().numpy() - for p in [0, 1, 2, 3, 4, float('inf')]: + for p in [0, 1, 2, 3, 4, inf]: res = x.norm(p, 1).cpu().numpy() expected = np.linalg.norm(xn, p, 1) self.assertEqual(res.shape, expected.shape) @@ -805,10 +863,10 @@ def test_reduction_empty(self): ('prod', lambda *args, **kwargs: 
torch.prod(*args, **kwargs), 1), ('sum', lambda *args, **kwargs: torch.sum(*args, **kwargs), 0), ('norm', lambda *args, **kwargs: torch.norm(*args, p=2, **kwargs), 0), - ('mean', lambda *args, **kwargs: torch.mean(*args, **kwargs), float('nan')), - ('var', lambda *args, **kwargs: torch.var(*args, **kwargs), float('nan')), - ('std', lambda *args, **kwargs: torch.std(*args, **kwargs), float('nan')), - ('logsumexp', lambda *args, **kwargs: torch.logsumexp(*args, **kwargs), float('-inf')), + ('mean', lambda *args, **kwargs: torch.mean(*args, **kwargs), nan), + ('var', lambda *args, **kwargs: torch.var(*args, **kwargs), nan), + ('std', lambda *args, **kwargs: torch.std(*args, **kwargs), nan), + ('logsumexp', lambda *args, **kwargs: torch.logsumexp(*args, **kwargs), -inf), ] devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] @@ -875,8 +933,8 @@ def test_pairwise_distance_empty(self): def test_logsumexp(self): from scipy.special import logsumexp a = torch.randn(5, 4) - a[0, 0] = float('inf') - a[1, :] = float('-inf') + a[0, 0] = inf + a[1, :] = -inf actual = a.logsumexp(1) expected = logsumexp(a.numpy(), 1) self.assertEqual(expected.shape, actual.shape) @@ -1064,18 +1122,18 @@ def test_csub(self): @staticmethod def _test_neg(self, cast): - float_types = ['torch.DoubleTensor', 'torch.FloatTensor', 'torch.LongTensor'] - int_types = ['torch.IntTensor', 'torch.ShortTensor', 'torch.ByteTensor', - 'torch.CharTensor'] + float_types = [torch.DoubleTensor, torch.FloatTensor, torch.LongTensor] + int_types = [torch.IntTensor, torch.ShortTensor, torch.ByteTensor, + torch.CharTensor] for t in float_types + int_types: if t in float_types: a = cast(torch.randn(100, 90).type(t)) else: - a = cast(torch.Tensor(100, 90).type(t).random_(-128, 128)) + a = cast(torch.randint(-128, 128, (100, 90), dtype=t.dtype)) zeros = cast(torch.Tensor().type(t)).resize_as_(a).zero_() - if t == 'torch.ByteTensor': + if t == torch.ByteTensor: res_add = torch.add(zeros, a, alpha=255) else: res_add = torch.add(zeros, a, alpha=-1) @@ -1537,7 +1595,7 @@ def test_cmul(self): self._test_cop(torch.mul, lambda x, y: x * y) def test_cpow(self): - self._test_cop(torch.pow, lambda x, y: float('nan') if x < 0 else math.pow(x, y)) + self._test_cop(torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y)) @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') def test_einsum(self): @@ -1615,65 +1673,32 @@ def check_sum_all(tensor): @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') def test_sum_dim(self): - def check_sum_dim(tensors, dim): - for tensor in tensors: - expected = tensor.numpy().sum(dim) - actual = tensor.sum(dim) - self.assertEqual(expected.shape, actual.shape) - if actual.dtype == torch.float: - self.assertTrue(np.allclose(expected, actual.numpy(), rtol=1e-03, atol=1e-05)) - else: - self.assertTrue(np.allclose(expected, actual.numpy())) - - float_types = [torch.double, - torch.float] - int_types = [torch.int64, - torch.int32, - torch.int16] - - def make_contiguous(shape, dtype): - if dtype in float_types: - return torch.randn(*shape, dtype=dtype) - result = torch.zeros(*shape, dtype=dtype) - result.apply_(lambda x: random.randint(-100, 100)) - return result - - def make_non_contiguous(shape, dtype): - contig = make_contiguous(shape, dtype) - non_contig = torch.empty(shape + (2,), dtype=dtype)[..., 0] - non_contig.copy_(contig) - self.assertFalse(non_contig.is_contiguous()) - return non_contig - - def make_tensors(*shape): - tensors = [] - for dtype in float_types + int_types: - 
tensors.append(make_contiguous(shape, dtype)) - tensors.append(make_non_contiguous(shape, dtype)) - return tensors - - check_sum_dim(make_tensors(5, 400000), 1) - check_sum_dim(make_tensors(3, 5, 7), 0) - check_sum_dim(make_tensors(3, 5, 7), 1) - check_sum_dim(make_tensors(3, 5, 7), 2) - check_sum_dim(make_tensors(100000), -1) - check_sum_dim(make_tensors(50, 50, 50), 0) - check_sum_dim(make_tensors(50, 50, 50), 1) - check_sum_dim(make_tensors(50, 50, 50), 2) - check_sum_dim(make_tensors(50, 50, 50), (1, 2)) - check_sum_dim(make_tensors(50, 50, 50), (1, -1)) - - def make_contiguous_slice(size, dtype): - contig = make_contiguous((1, size), dtype) - non_contig = contig[:1, 1:size - 1] - self.assertTrue(non_contig.is_contiguous()) - return contig - - for dtype in float_types + int_types: - check_sum_dim(make_contiguous_slice(5, dtype), 0) - check_sum_dim(make_contiguous_slice(50, dtype), 0) - check_sum_dim(make_contiguous_slice(500, dtype), 0) - check_sum_dim(make_contiguous_slice(100000, dtype), 0) + def check_sum_dim(tensors_dict, dim): + for category, tensors in tensors_dict.items(): + if category == "slice": + dim = 0 + for tensor in tensors: + expected = tensor.numpy().sum(dim) + actual = tensor.sum(dim) + self.assertEqual(expected.shape, actual.shape) + if actual.dtype == torch.float: + self.assertTrue(np.allclose(expected, actual.numpy(), rtol=1e-03, atol=1e-05)) + else: + self.assertTrue(np.allclose(expected, actual.numpy())) + + float_types = [torch.double, torch.float] + int_types = [torch.int64, torch.int32, torch.int16] + + check_sum_dim(self._make_tensors((5, 400000)), 1) + check_sum_dim(self._make_tensors((3, 5, 7)), 0) + check_sum_dim(self._make_tensors((3, 5, 7)), 1) + check_sum_dim(self._make_tensors((3, 5, 7)), 2) + check_sum_dim(self._make_tensors((100000, )), -1) + check_sum_dim(self._make_tensors((50, 50, 50)), 0) + check_sum_dim(self._make_tensors((50, 50, 50)), 1) + check_sum_dim(self._make_tensors((50, 50, 50)), 2) + check_sum_dim(self._make_tensors((50, 50, 50)), (1, 2)) + check_sum_dim(self._make_tensors((50, 50, 50)), (1, -1)) def test_sum_out(self): x = torch.rand(100, 100) @@ -1906,6 +1931,12 @@ def test_device(self): self.assertRaises(TypeError, lambda: torch.device('other')) self.assertRaises(TypeError, lambda: torch.device('other:0')) + device_set = {'cpu', 'cpu:0', 'cuda', 'cuda:0', 'cuda:1', 'cuda:10', 'cuda:100'} + device_hash_set = set() + for device in list(device_set): + device_hash_set.add(hash(torch.device(device))) + self.assertEqual(len(device_set), len(device_hash_set)) + def test_tensor_device(self): def assertEqual(device_str, fn): self.assertEqual(torch.device(device_str), fn().device) @@ -2407,7 +2438,7 @@ def _test_renorm_ps(self, device): # full reduction x = torch.randn(5, 5) xn = x.numpy() - for p in [1, 2, 3, 4, float('inf')]: + for p in [1, 2, 3, 4, inf]: res = x.renorm(p, 1, 1) expected = x / x.norm(p, 0, keepdim=True).clamp(min=1) self.assertEqual(res.numpy(), expected.numpy(), "renorm failed for {}-norm".format(p)) @@ -2523,9 +2554,9 @@ def _test_multinomial_invalid_probs(probs): def test_multinomial_invalid_probs(self): test_method = TestTorch._test_multinomial_invalid_probs self._spawn_method(test_method, torch.Tensor([0, -1])) - self._spawn_method(test_method, torch.Tensor([0, float('inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + self._spawn_method(test_method, torch.Tensor([0, inf])) + self._spawn_method(test_method, torch.Tensor([0, 
-inf])) + self._spawn_method(test_method, torch.Tensor([0, nan])) @suppress_warnings def test_range(self): @@ -4455,106 +4486,61 @@ def test_fft_ifft_rfft_irfft(self): @staticmethod def _test_stft(self, device='cpu'): - def naive_stft(x, frame_length, hop, fft_size=None, normalized=False, - onesided=True, window=None, pad_end=0): - if fft_size is None: - fft_size = frame_length - x = x.clone() + if not TEST_LIBROSA: + raise unittest.SkipTest('librosa not found') + + def librosa_stft(x, n_fft, hop_length, win_length, window, center): if window is None: - window = x.new_ones(frame_length) + window = np.ones(n_fft if win_length is None else win_length) else: - window = window.clone() + window = window.cpu().numpy() input_1d = x.dim() == 1 if input_1d: x = x.view(1, -1) - batch = x.size(0) - if pad_end > 0: - x_pad = x.new(batch, pad_end).fill_(0) - x = torch.cat([x, x_pad], 1) - length = x.size(1) - if TEST_NUMPY and TEST_SCIPY: - sp_result = signal.stft( - x, - nperseg=frame_length, - noverlap=frame_length - hop, - window=window, - nfft=fft_size, - return_onesided=onesided, - boundary=None, - padded=False, - )[2].transpose((0, 2, 1)) * np.abs(window.sum().item()) - result = torch.Tensor(np.stack([sp_result.real, sp_result.imag], -1)) - else: - if onesided: - return_size = int(fft_size / 2) + 1 - else: - return_size = fft_size - result = x.new(batch, int((length - frame_length) / float(hop)) + 1, return_size, 2) - for w in range(return_size): # freq - radians = torch.arange(float(frame_length)) * w * 2 * math.pi / fft_size - radians = radians.type_as(x) - re_kernel = radians.cos().mul_(window) - im_kernel = -radians.sin().mul_(window) - for b in range(batch): - for i, t in enumerate(range(0, length - frame_length + 1, hop)): - seg = x[b, t:(t + frame_length)] - re = seg.dot(re_kernel) - im = seg.dot(im_kernel) - result[b, i, w, 0] = re - result[b, i, w, 1] = im - if normalized: - result /= frame_length ** 0.5 + result = [] + for xi in x: + ri = librosa.stft(xi.cpu().numpy(), n_fft, hop_length, win_length, window, center=center) + result.append(torch.from_numpy(np.stack([ri.real, ri.imag], -1))) + result = torch.stack(result, 0) if input_1d: result = result[0] return result - def _test(sizes, frame_length, hop, fft_size=None, normalized=False, - onesided=True, window_sizes=None, pad_end=0, expected_error=None): + def _test(sizes, n_fft, hop_length=None, win_length=None, win_sizes=None, + center=True, expected_error=None): x = torch.randn(*sizes, device=device) - if window_sizes is not None: - window = torch.randn(*window_sizes, device=device) + if win_sizes is not None: + window = torch.randn(*win_sizes, device=device) else: window = None if expected_error is None: - result = x.stft(frame_length, hop, fft_size, normalized, onesided, window, pad_end) - ref_result = naive_stft(x, frame_length, hop, fft_size, normalized, onesided, window, pad_end) - self.assertEqual(result.data, ref_result, 7e-6, 'stft result') + result = x.stft(n_fft, hop_length, win_length, window, center=center) + ref_result = librosa_stft(x, n_fft, hop_length, win_length, window, center) + self.assertEqual(result, ref_result, 7e-6, 'stft comparison against librosa') else: self.assertRaises(expected_error, - lambda: x.stft(frame_length, hop, fft_size, normalized, onesided, window, pad_end)) - - _test((2, 5), 4, 2, pad_end=1) - _test((4, 150), 90, 45, pad_end=0) - _test((10,), 7, 2, pad_end=0) - _test((10, 4000), 1024, 512, pad_end=0) + lambda: x.stft(n_fft, hop_length, win_length, window, center=center)) - _test((2, 5), 4, 2, 
window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, window_sizes=(1024,), pad_end=0) + for center in [True, False]: + _test((10,), 7, center=center) + _test((10, 4000), 1024, center=center) - _test((2, 5), 4, 2, fft_size=5, window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, fft_size=100, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, fft_size=33, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, fft_size=1500, window_sizes=(1024,), pad_end=0) + _test((10,), 7, 2, center=center) + _test((10, 4000), 1024, 512, center=center) - _test((2, 5), 4, 2, fft_size=5, onesided=False, window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, fft_size=100, onesided=False, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, fft_size=33, onesided=False, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, fft_size=1500, onesided=False, window_sizes=(1024,), pad_end=0) + _test((10,), 7, 2, win_sizes=(7,), center=center) + _test((10, 4000), 1024, 512, win_sizes=(1024,), center=center) - _test((2, 5), 4, 2, fft_size=5, normalized=True, onesided=False, window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, fft_size=100, normalized=True, onesided=False, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, fft_size=33, normalized=True, onesided=False, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, fft_size=1500, normalized=True, onesided=False, window_sizes=(1024,), pad_end=0) + # spectral oversample + _test((10,), 7, 2, win_length=5, center=center) + _test((10, 4000), 1024, 512, win_length=100, center=center) _test((10, 4, 2), 1, 1, expected_error=RuntimeError) - _test((10,), 11, 1, expected_error=RuntimeError) - _test((10,), 0, 1, pad_end=4, expected_error=RuntimeError) - _test((10,), 15, 1, pad_end=4, expected_error=RuntimeError) - _test((10,), 5, -4, expected_error=RuntimeError) - _test((10,), 5, 4, window_sizes=(11,), expected_error=RuntimeError) - _test((10,), 5, 4, window_sizes=(1, 1), expected_error=RuntimeError) + _test((10,), 11, 1, center=False, expected_error=RuntimeError) + _test((10,), -1, 1, expected_error=RuntimeError) + _test((10,), 3, win_length=5, expected_error=RuntimeError) + _test((10,), 5, 4, win_sizes=(11,), expected_error=RuntimeError) + _test((10,), 5, 4, win_sizes=(1, 1), expected_error=RuntimeError) def test_stft(self): self._test_stft(self) @@ -4707,14 +4693,18 @@ def test_logical(self): self.assertEqual(neqs.long().sum(), xne.long().sum(), 0) self.assertEqual(x.nelement(), all.long().sum()) - def test_isnan(self): - x = torch.Tensor([1, float('nan'), 2]) - self.assertEqual(torch.isnan(x), torch.ByteTensor([0, 1, 0])) + def test_isfinite(self): + x = torch.Tensor([1, inf, 2, -inf, nan, -10]) + self.assertEqual(torch.isfinite(x), torch.ByteTensor([1, 0, 1, 0, 0, 1])) def test_isinf(self): - x = torch.Tensor([1, float('inf'), 2, float('-inf'), float('nan')]) + x = torch.Tensor([1, inf, 2, -inf, nan]) self.assertEqual(torch.isinf(x), torch.ByteTensor([0, 1, 0, 1, 0])) + def test_isnan(self): + x = torch.Tensor([1, nan, 2]) + self.assertEqual(torch.isnan(x), torch.ByteTensor([0, 1, 0])) + def test_RNGState(self): state = torch.get_rng_state() stateCloned = state.clone() @@ -5871,26 +5861,31 @@ def test_masked_fill(self): self.assertEqual(dst, dst2, 0) def test_abs(self): - size = 1000 - max_val = 1000 - original = torch.rand(size).mul(max_val) - # Tensor filled with values from {-1, 1} - switch = 
torch.rand(size).mul(2).floor().mul(2).add(-1) + def _test_abs(tensors_dict): + for category, tensors in tensors_dict.items(): + for data in tensors: + switch = torch.rand(data.size()).mul(2).floor().mul(2).add(-1).type(data.dtype) + res = torch.mul(data, switch) + self.assertTensorsSlowEqual(res.abs(), data, 1e-16) - types = ['torch.DoubleTensor', 'torch.FloatTensor', 'torch.LongTensor', - 'torch.IntTensor', 'torch.ShortTensor'] - for t in types: - data = original.type(t) - switch = switch.type(t) - res = torch.mul(data, switch) - # abs is used in assertEqual so we use the slow version instead - self.assertTensorsSlowEqual(res.abs(), data, 1e-16) + max_val = 1000 + _test_abs(self._make_tensors((3, 4), val_range=(0, max_val))) + _test_abs(self._make_tensors((3, 5, 7), val_range=(0, max_val))) + _test_abs(self._make_tensors((2, 2, 5, 8, 2, 3), val_range=(0, max_val))) + _test_abs(self._make_tensors((1000, ), val_range=(0, max_val))) + _test_abs(self._make_tensors((30, 30, 30), val_range=(0, max_val))) # Checking that the right abs function is called for LongTensor bignumber = 2 ^ 31 + 1 res = torch.LongTensor((-bignumber,)) self.assertGreater(res.abs()[0], 0) + # One of + rec = torch.randn(2, 2, 3, 7, 6, 2).type(torch.float64).clamp(0, 1) + val1 = rec.select(-1, -1).data[0][0][0].sum() + val2 = rec.select(-1, -1).data.abs()[0][0][0].sum() + self.assertEqual(val1, val2, 1e-8, 'absolute value') + def test_hardshrink(self): data_original = torch.tensor([1, 0.5, 0.3, 0.6]).view(2, 2) float_types = [ @@ -6031,6 +6026,11 @@ def test_reshape(self): self.assertEqual(empty.reshape([1, -1]).shape, (0,)) self.assertRaises(RuntimeError, lambda: empty.reshape(1)) + x = torch.randn(3, 3) + self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(9)).data_ptr()) + self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(1, 9, 1)).data_ptr()) + self.assertRaises(RuntimeError, lambda: x.reshape_as(torch.rand(10))) + @skipIfNoZeroSize def test_empty_reshape(self): x = torch.randn(0, 6) @@ -6106,6 +6106,255 @@ def test_tensor_shape_empty(self): self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 1, dim=0)]) self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)]) + # functions that operate over a dimension but don't reduce. + @skipIfNoZeroSize + def test_dim_function_empty(self): + # FIXME: enable CUDA tests. 
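A quick aside on the reshape_as assertions added just above: reshape only copies when the requested shape cannot be expressed as a view of the existing storage, so comparing data_ptr() is a cheap way to assert that no copy happened. A minimal sketch of the same check outside the test harness (the tensors here are made up for illustration, not taken from the test):

    import torch

    x = torch.randn(3, 3)                # contiguous, so reshape can alias the storage
    y = x.reshape_as(torch.rand(9))      # same 9 elements, new shape, no copy
    assert x.data_ptr() == y.data_ptr()

    z = x.t().reshape(9)                 # non-contiguous source: reshape has to copy
    assert x.data_ptr() != z.data_ptr()
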
+ devices = ['cpu'] # if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + shape = (0, 1, 2, 0) + x = torch.randn(shape, device=device) + + # size stride + self.assertEqual(0, x.size(3)) + self.assertEqual(2, x.size(2)) + self.assertEqual(2, x.stride(0)) + self.assertEqual(1, x.stride(2)) + + self.assertEqual(x, torch.nn.functional.glu(x, 0)) + self.assertEqual((0, 1, 1, 0), torch.nn.functional.glu(x, 2).shape) + + # softmax, logsoftmax + self.assertEqual(x, torch.nn.functional.softmax(x, 0)) + self.assertEqual(x, torch.nn.functional.softmax(x, 2)) + + self.assertEqual(x, torch.nn.functional.log_softmax(x, 0)) + self.assertEqual(x, torch.nn.functional.log_softmax(x, 2)) + + # cumsum, cumprod + self.assertEqual(shape, torch.cumsum(x, 0).shape) + self.assertEqual(shape, torch.cumsum(x, 2).shape) + self.assertEqual(shape, torch.cumprod(x, 0).shape) + self.assertEqual(shape, torch.cumprod(x, 2).shape) + + # flip + self.assertEqual(x, x.flip(0)) + self.assertEqual(x, x.flip(2)) + + # unbind + self.assertEqual((), x.unbind(0)) + self.assertEqual((torch.empty((0, 1, 0), device=device), torch.empty((0, 1, 0), device=device)), + x.unbind(2)) + + # cross + y = torch.randn((0, 1, 3, 0), device=device) + self.assertEqual(y.shape, torch.cross(y, y).shape) + + # renorm + self.assertEqual(shape, torch.renorm(x, 1, 0, 5).shape) + self.assertEqual(shape, torch.renorm(x, 1, 2, 5).shape) + + # sort + self.assertEqual([shape, shape], [z.shape for z in torch.sort(x, dim=0)]) + self.assertEqual([shape, shape], [z.shape for z in torch.sort(x, dim=2)]) + + # topk + self.assertEqual([shape, shape], [z.shape for z in torch.topk(x, 0, dim=0)]) + self.assertEqual([(0, 1, 1, 0), (0, 1, 1, 0)], [z.shape for z in torch.topk(x, 1, dim=2)]) + + y = torch.randn((2, 3, 4), device=device) + self.assertEqual([(2, 3, 0), (2, 3, 0)], [z.shape for z in torch.topk(y, 0)]) + + # gather + self.assertEqual(shape, torch.gather(x, 0, torch.empty(shape, dtype=torch.int64)).shape) + self.assertEqual(shape, torch.gather(x, 2, torch.empty(shape, dtype=torch.int64)).shape) + larger_shape = (0, 1, 3, 0) + self.assertEqual(larger_shape, torch.gather(x, 2, torch.empty(larger_shape, dtype=torch.int64)).shape) + smaller_shape = (0, 1, 0, 0) + self.assertEqual(smaller_shape, torch.gather(x, 2, torch.empty(smaller_shape, dtype=torch.int64)).shape) + y = torch.randn((2, 3, 4), device=device) + self.assertEqual((0, 3, 4), torch.gather(y, 0, torch.empty((0, 3, 4), dtype=torch.int64)).shape) + + # scatter, scatter_add + for dim in [0, 2]: + y = torch.randn(shape, device=device) + y_src = torch.randn(shape, device=device) + self.assertEqual(shape, y.scatter_(dim, torch.empty(shape, dtype=torch.int64), y_src).shape) + self.assertEqual(shape, y.scatter_add_(dim, torch.empty(shape, dtype=torch.int64), y_src).shape) + + z = torch.randn((2, 3, 4), device=device) + z_src = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.scatter_(2, torch.empty((2, 3, 0), dtype=torch.int64), z_src)) + self.assertEqual(z, z.scatter_add_(2, torch.empty((2, 3, 0), dtype=torch.int64), z_src)) + + # index_fill, index_copy, index_add + c = x.clone() + ind_empty = torch.tensor([], dtype=torch.int64) + ind_01 = torch.tensor([0, 1], dtype=torch.int64) + self.assertEqual(c, c.index_fill_(0, ind_empty, -1)) + self.assertEqual(c, c.index_fill_(2, ind_empty, -1)) + self.assertEqual(c, c.index_fill_(2, torch.tensor([0, 1], dtype=torch.int64), -1)) + self.assertEqual(c, c.index_copy_(0, ind_empty, torch.empty((0, 1, 2, 0), device=device))) + 
self.assertEqual(c, c.index_copy_(2, ind_empty, torch.empty((0, 1, 0, 0), device=device))) + self.assertEqual(c, c.index_copy_(2, ind_01, torch.empty((0, 1, 2, 0), device=device))) + self.assertEqual(c, c.index_add_(0, ind_empty, torch.empty((0, 1, 2, 0), device=device))) + self.assertEqual(c, c.index_add_(2, ind_empty, torch.empty((0, 1, 0, 0), device=device))) + self.assertEqual(c, c.index_add_(2, ind_01, torch.empty((0, 1, 2, 0), device=device))) + + # index fill/copy/add non-empty + z = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.index_fill_(0, ind_empty, -1)) + z = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.index_copy_(0, ind_empty, torch.empty((0, 3, 4), device=device))) + z = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.index_add_(0, ind_empty, torch.empty((0, 3, 4), device=device))) + + # index_select + self.assertEqual(x, x.index_select(0, ind_empty)) + self.assertEqual((0, 1, 0, 0), x.index_select(2, ind_empty).shape) + self.assertEqual(x, x.index_select(2, ind_01)) + z = torch.randn((2, 3, 4), device=device) # non-empty + self.assertEqual((0, 3, 4), z.index_select(0, ind_empty).shape) + + @skipIfNoZeroSize + def test_blas_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + + def fn(torchfn, *args): + return torchfn(*tuple(torch.randn(shape, device=device) if isinstance(shape, tuple) else shape + for shape in args)) + + # mm, addmm + self.assertEqual((0, 0), fn(torch.mm, (0, 0), (0, 0)).shape) + self.assertEqual((0, 5), fn(torch.mm, (0, 0), (0, 5)).shape) + self.assertEqual((5, 0), fn(torch.mm, (5, 0), (0, 0)).shape) + self.assertEqual((3, 0), fn(torch.mm, (3, 2), (2, 0)).shape) + self.assertEqual(torch.zeros((5, 6), device=device), fn(torch.mm, (5, 0), (0, 6))) + + self.assertEqual((0, 0), fn(torch.addmm, (0, 0), (0, 0), (0, 0)).shape) + self.assertEqual((5, 6), fn(torch.addmm, (5, 6), (5, 0), (0, 6)).shape) + + # mv, addmv + self.assertEqual((0,), fn(torch.mv, (0, 0), (0,)).shape) + self.assertEqual((0,), fn(torch.mv, (0, 2), (2,)).shape) + self.assertEqual(torch.zeros((3,), device=device), fn(torch.mv, (3, 0), (0,))) + + self.assertEqual((0,), fn(torch.addmv, (0,), (0, 0), (0,)).shape) + self.assertEqual((3,), fn(torch.addmv, (3,), (3, 0), (0,)).shape) + + # ger, addr + self.assertEqual((0, 0), fn(torch.ger, (0,), (0,)).shape) + self.assertEqual((5, 0), fn(torch.ger, (5,), (0,)).shape) + self.assertEqual((0, 4), fn(torch.ger, (0,), (4,)).shape) + + self.assertEqual((0, 0), fn(torch.addr, (0, 0), (0,), (0,)).shape) + self.assertEqual((5, 0), fn(torch.addr, (5, 0), (5,), (0,)).shape) + self.assertEqual((0, 4), fn(torch.addr, (0, 4), (0,), (4,)).shape) + + # bmm, baddbmm + self.assertEqual((0, 0, 0), fn(torch.bmm, (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((3, 0, 5), fn(torch.bmm, (3, 0, 0), (3, 0, 5)).shape) + self.assertEqual((0, 5, 6), fn(torch.bmm, (0, 5, 0), (0, 0, 6)).shape) + self.assertEqual(torch.zeros((3, 5, 6), device=device), fn(torch.bmm, (3, 5, 0), (3, 0, 6))) + + self.assertEqual((0, 0, 0), fn(torch.baddbmm, (0, 0, 0), (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((3, 0, 5), fn(torch.baddbmm, (3, 0, 5), (3, 0, 0), (3, 0, 5)).shape) + self.assertEqual((0, 5, 6), fn(torch.baddbmm, (0, 5, 6), (0, 5, 0), (0, 0, 6)).shape) + self.assertEqual((3, 5, 6), fn(torch.baddbmm, (3, 5, 6), (3, 5, 0), (3, 0, 6)).shape) + + # addbmm + self.assertEqual((0, 0), fn(torch.addbmm, (0, 0), (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((0, 5), fn(torch.addbmm, (0, 
5), (3, 0, 0), (3, 0, 5)).shape) + self.assertEqual((5, 6), fn(torch.addbmm, (5, 6), (0, 5, 0), (0, 0, 6)).shape) + + # matmul + self.assertEqual(torch.tensor(0., device=device), fn(torch.matmul, (0,), (0,))) + self.assertEqual((0, 0), fn(torch.matmul, (0, 0), (0, 0)).shape) + self.assertEqual((0, 0, 0), fn(torch.matmul, (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((5, 0, 0), fn(torch.matmul, (5, 0, 0), (5, 0, 0)).shape) + self.assertEqual(torch.zeros((5, 3, 4), device=device), fn(torch.matmul, (5, 3, 0), (5, 0, 4))) + + # dot + self.assertEqual(torch.tensor(0., device=device), fn(torch.dot, (0,), (0,))) + + # btrifact + A_LU, pivots = fn(torch.btrifact, (0, 5, 5)) + self.assertEqual([(0, 5, 5), (0, 5)], [A_LU.shape, pivots.shape]) + A_LU, pivots = fn(torch.btrifact, (0, 0, 0)) + self.assertEqual([(0, 0, 0), (0, 0)], [A_LU.shape, pivots.shape]) + A_LU, pivots = fn(torch.btrifact, (2, 0, 0)) + self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) + + @skipIfNoZeroSize + def test_blas_alpha_beta_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in ['cuda']: + # ensure beta is respected + value = 11 + input = torch.full((2,), value, device=device) + mat = torch.ones((2, 0), device=device) + vec = torch.ones((0,), device=device) + out = torch.randn((2,), device=device) + alpha = 6 + beta = 3 + self.assertEqual(torch.full((2,), beta * value, device=device), + torch.addmv(input=input, mat=mat, vec=vec, alpha=alpha, beta=beta)) + self.assertEqual(torch.full((2,), beta * value, device=device), + torch.addmv(input=input, mat=mat, vec=vec, alpha=alpha, beta=beta, out=out)) + + # torch.addmm + input = torch.full((2, 3), value, device=device) + mat2 = torch.ones((0, 3), device=device) + out = torch.randn((2, 3), device=device) + self.assertEqual(torch.full((2, 3), beta * value, device=device), + torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta)) + self.assertEqual(torch.full((2, 3), beta * value, device=device), + torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) + + @skipIfNoZeroSize + @skipIfNoLapack + def test_lapack_empty(self): + # FIXME: these are just a selection of LAPACK functions -- we need a general strategy here. + # The LAPACK functions themselves generally do NOT work with zero sized dimensions, although + # numpy/sci often has a direct wrapper (e.g. lu_factor) and a wrapper that "does the right thing" + # (e.g. lu). We often name our functions identically to the lapack function, so it will take work + # to name / migrate-to better wrappers. + + # FIXME: enable CUDA tests. 
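The alpha/beta test above leans on the identity addmv(input, mat, vec, beta=b, alpha=a) == b * input + a * (mat @ vec); when the reduction dimension has size zero the product term is all zeros, so the result must collapse to beta * input rather than to zeros. A rough sketch of that expectation, assuming zero-sized dimensions are enabled as in this patch (values are illustrative):

    import torch

    inp = torch.full((2,), 11.)
    mat = torch.ones(2, 0)    # empty reduction dimension
    vec = torch.ones(0)
    out = torch.addmv(inp, mat, vec, beta=3, alpha=6)
    assert torch.allclose(out, 3 * inp)   # mat @ vec contributes nothing
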
+ devices = ['cpu'] # if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + + def fn(torchfn, *args): + return torchfn(*tuple(torch.randn(shape, device=device) if isinstance(shape, tuple) else shape + for shape in args)) + + # inverse, pinverse + self.assertEqual((0, 0), fn(torch.inverse, (0, 0)).shape) + self.assertEqual((5, 0), fn(torch.pinverse, (0, 5)).shape) + self.assertEqual((0, 5), fn(torch.pinverse, (5, 0)).shape) + self.assertEqual((0, 0), fn(torch.pinverse, (0, 0)).shape) + + # svd + self.assertRaises(RuntimeError, lambda: fn(torch.svd, (0, 0))) + + # det, logdet, slogdet + self.assertEqual(torch.tensor(1., device=device), fn(torch.det, (0, 0))) + self.assertEqual(torch.tensor(0., device=device), fn(torch.logdet, (0, 0))) + self.assertEqual((torch.tensor(1., device=device), torch.tensor(0., device=device)), + fn(torch.slogdet, (0, 0))) + + # eig, symeig + evalues, evectors = fn(torch.eig, (0, 0), True) + self.assertEqual([(0, 2), (0, 0)], [evalues.shape, evectors.shape]) + evalues, evectors = fn(torch.symeig, (0, 0), True) + self.assertEqual([(0,), (0, 0)], [evalues.shape, evectors.shape]) + + # qr, gels + self.assertRaises(RuntimeError, lambda: torch.qr(torch.randn(0, 0))) + self.assertRaises(RuntimeError, lambda: torch.gels(torch.randn(0, 0), torch.randn(0, 0))) + self.assertRaises(RuntimeError, lambda: torch.gels(torch.randn(0,), torch.randn(0, 0))) + def test_expand(self): tensor = torch.rand(1, 8, 1) tensor2 = torch.rand(5) @@ -7221,7 +7470,7 @@ def test_print(self): self.assertExpected(str(x), subname='negint') # test inf and nan - x = torch.tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1]) + x = torch.tensor([4, inf, 1.5, -inf, 0, nan, 1]) self.assertEqual(x.__repr__(), str(x)) self.assertExpected(str(x), subname='nonfinite') diff --git a/third_party/onnx b/third_party/onnx index b2817a682f25f9..0efd9f85c4e837 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit b2817a682f25f960586f06caa539bbbd7a96b859 +Subproject commit 0efd9f85c4e837e8d64a8ea4d2d5b7a59fab75bb diff --git a/tools/amd_build/build_caffe2_amd.py b/tools/amd_build/build_caffe2_amd.py new file mode 100755 index 00000000000000..9effd464bbdb38 --- /dev/null +++ b/tools/amd_build/build_caffe2_amd.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +import os +import sys +import subprocess + +amd_build_dir = os.path.dirname(os.path.realpath(__file__)) +proj_dir = os.path.join(os.path.dirname(os.path.dirname(amd_build_dir))) + +includes = [ + "caffe2/operators/*", + "caffe2/sgd/*", + "caffe2/image/*", + "caffe2/transforms/*", + "caffe2/video/*", + "caffe2/distributed/*", +] + +ignores = [ + "caffe2/operators/depthwise_3x3_conv_op.cu", + "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", + "caffe2/operators/top_k.cu", + "caffe2/operators/top_k_radix_selection.cuh", + "caffe2/operators/top_k_heap_selection.cuh", + "caffe2/operators/pool_op_cudnn.cu", + "caffe2/operators/roi_align_op_gpu_test.cc", + # elementwise ops test is failing + "caffe2/operators/elementwise_op_gpu_test.cc", + '**/hip/**', +] + +file_extensions = ['.cc', '.cu', '.h', '.cuh'] + +# Execute the Hipify Script. 
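Before the invocation that follows: the --includes / --ignores lists assembled above are plain fnmatch globs matched against paths relative to the project root (see matched_files_iter later in this patch), with ignores taking precedence over includes. A rough sketch of the selection rule, using a hypothetical operator file for illustration (not the script's own code):

    import fnmatch

    def selected(rel_path, includes, ignores):
        def matches(patterns):
            return any(fnmatch.fnmatch(rel_path, p) for p in patterns)
        return matches(includes) and not matches(ignores)

    includes = ["caffe2/operators/*", "caffe2/sgd/*"]
    ignores = ["caffe2/operators/top_k.cu", "**/hip/**"]
    assert selected("caffe2/operators/relu_op.cu", includes, ignores)
    assert not selected("caffe2/operators/top_k.cu", includes, ignores)
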
+args = [ + "--project-directory", proj_dir, + "--output-directory", proj_dir, + "--includes"] + includes + \ + ["--extensions"] + file_extensions + \ + ["--ignores"] + ignores + \ + ["--hipify_caffe2", "True"] + \ + ["--add-static-casts", "True"] + +subprocess.check_call([ + sys.executable, + os.path.join(amd_build_dir, "pyHIPIFY", "hipify-python.py"), +] + args) diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 3d3ff2958a5ecf..ed7206f0bf6d27 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -8,9 +8,9 @@ amd_build_dir = os.path.dirname(os.path.realpath(__file__)) proj_dir = os.path.dirname(os.path.dirname(amd_build_dir)) -include_dirs = [ - "aten", - "torch" +includes = [ + "aten/*", + "torch/*" ] # List of operators currently disabled @@ -63,9 +63,12 @@ # Execute the Hipify Script. args = (["--project-directory", proj_dir] + ["--output-directory", proj_dir] + - ["--include-dirs"] + include_dirs + + ["--includes"] + includes + ["--yaml-settings", yaml_file] + ["--add-static-casts", "True"] + ["--show-progress", "False"]) -os.execv(os.path.join(amd_build_dir, "pyHIPIFY", "hipify-python.py"), ['python'] + args) +subprocess.check_call([ + sys.executable, + os.path.join(amd_build_dir, "pyHIPIFY", "hipify-python.py") +] + args) diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py index 1ea8f81a9ca9f9..092de16cff7a90 100644 --- a/tools/amd_build/pyHIPIFY/constants.py +++ b/tools/amd_build/pyHIPIFY/constants.py @@ -53,3 +53,4 @@ HIP_UNSUPPORTED = 43 API_PYTORCH = 1337 +API_CAFFE2 = 1338 \ No newline at end of file diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index c03562861692a1..26322a5842c09c 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -2106,5 +2106,29 @@ "define MAX_NUM_BLOCKS 200": ("define MAX_NUM_BLOCKS 64", API_PYTORCH), } +CAFFE2_SPECIFIC_MAPPINGS = { + "CUDA" :("HIP", API_CAFFE2), + "REGISTER_CUDA_OPERATOR" : ("REGISTER_HIP_OPERATOR", API_CAFFE2), + "cuda_stream" : ("hip_stream", API_CAFFE2), + "context_gpu" : ("hip/context_hip", API_CAFFE2), + "common_gpu" : ("hip/common_hip", API_CAFFE2), + "mixed_utils" : ("hip/mixed_utils_hip", API_CAFFE2), + "operator_fallback_gpu" : ("hip/operator_fallback_hip", API_CAFFE2), + "recurrent_network_executor_gpu" : ("hip/recurrent_network_executor_hip", API_CAFFE2), + "max_pool_with_index_gpu": ("hip/max_pool_with_index_hip", API_CAFFE2), + "CUDA_1D_KERNEL_LOOP" : ("HIP_1D_KERNEL_LOOP", API_CAFFE2), + "CUDAContext" : ("HIPContext", API_CAFFE2), + "CAFFE_CUDA_NUM_THREADS" : ("CAFFE_HIP_NUM_THREADS", API_CAFFE2), + "HasCudaGPU" : ("HasHipGPU", API_CAFFE2), + "__expf" : ("expf", API_CAFFE2), + "CUBLAS_ENFORCE" : ("ROCBLAS_ENFORCE", API_CAFFE2), + "cublas_handle" : ("rocblas_handle", API_CAFFE2), + "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2), + "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2), + "curand_generator" : ("hiprand_generator", API_CAFFE2), + "set_cuda_gpu_id" : ("set_hip_gpu_id", API_CAFFE2), + "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2), +} + CUDA_TO_HIP_MAPPINGS = [CUDA_TYPE_NAME_MAP, CUDA_IDENTIFIER_MAP, - CUDA_INCLUDE_MAP, CUDA_SPARSE_MAP, PYTORCH_SPECIFIC_MAPPINGS] + CUDA_INCLUDE_MAP, CUDA_SPARSE_MAP, PYTORCH_SPECIFIC_MAPPINGS, CAFFE2_SPECIFIC_MAPPINGS] diff --git a/tools/amd_build/pyHIPIFY/hipify-python.py 
b/tools/amd_build/pyHIPIFY/hipify-python.py index 15a717c7766cdf..fc3efabd26db7f 100755 --- a/tools/amd_build/pyHIPIFY/hipify-python.py +++ b/tools/amd_build/pyHIPIFY/hipify-python.py @@ -26,11 +26,13 @@ import argparse import constants +import fnmatch import re import shutil import sys import os import yaml +import ast from functools import reduce from enum import Enum @@ -40,6 +42,7 @@ """This dictionary provides the mapping from PyTorch kernel template types to their actual types.""" PYTORCH_TEMPLATE_MAP = {"Dtype": "real", "T": "real"} +CAFFE2_TEMPLATE_MAP = {} def openf(filename, mode): @@ -210,72 +213,47 @@ def update_progress_bar(total, progress): sys.stderr.flush() -def filename_ends_with_extension(filename, extensions): - """Helper method to see if filename ends with certain extension""" - for ext in extensions: - if filename.endswith("." + ext): - return True +def matched_files_iter(root_path, includes=('*',), ignores=(), extensions=(), hipify_caffe2=False): + def _fnmatch(filepath, patterns): + return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns) - return False + def match_extensions(filename): + """Helper method to see if filename ends with certain extension""" + return os.path.splitext(filename)[1] in extensions + for (dirpath, _, filenames) in os.walk(root_path, topdown=True): + for fn in filenames: + filepath = os.path.join(dirpath, fn) + rel_filepath = os.path.relpath(filepath, root_path) + if _fnmatch(rel_filepath, includes) and (not _fnmatch(rel_filepath, ignores)) and match_extensions(fn): + if hipify_caffe2 and not is_caffe2_gpu_file(filepath): + continue -def inside_included_directories(dirpath, rootpath, include_dirs): - """Helper method to see if filename within included directories""" - for included_directory in include_dirs: - if re.match(r'{0}\b'.format(os.path.join(rootpath, included_directory)), dirpath): - return True + yield filepath - return False - -def walk_over_directory(rootpath, extensions, show_detailed=False, include_dirs=None, show_progress=True): +def preprocess(all_files, show_detailed=False, show_progress=True, hipify_caffe2=False): """ - Recursively walk over the directory and call preprocessor on selected files. + Call preprocessor on selected files. Arguments) - extensions - A plist of file extensions ['cu', 'cuh', ..] - - include_dirs - Directories under the rootpath that should be included in the walk. - show_detailed - Show a detailed summary of the transpilation process. """ - # Default argument for excluded directories. - if include_dirs is None: - include_dirs = [] - # Compute the total number of files to be traversed. - total_files = 0 - for (dirpath, _dirnames, filenames) in os.walk(rootpath): - if inside_included_directories(dirpath, rootpath, include_dirs): - for filename in filenames: - total_files += filename_ends_with_extension(filename, extensions) - - current_file = 0 + total_count = len(all_files) + finished_count = 0 # Preprocessing statistics. stats = {"unsupported_calls": [], "kernel_launches": []} - # Begin traversing the files. - for (dirpath, _dirnames, filenames) in os.walk(rootpath, topdown=True): - # Check if file ends with a valid extensions - if not inside_included_directories(dirpath, rootpath, include_dirs): - continue - - for filename in filenames: - if filename_ends_with_extension(filename, extensions): - # Construct the file's full path - filepath = os.sep.join([dirpath, filename]) - - # Execute the preprocessor on the specified file. 
- preprocessor(filepath, stats) - - # Update the progress - if show_progress: - print(os.path.join(dirpath, filename)) - update_progress_bar(total_files, current_file) - - current_file += 1 + for filepath in all_files: + preprocessor(filepath, stats, hipify_caffe2) + # Update the progress + if show_progress: + print(filepath) + update_progress_bar(total_count, finished_count) + finished_count += 1 print(bcolors.OKGREEN + "Successfully preprocessed all matching files." + bcolors.ENDC) @@ -297,6 +275,41 @@ def compute_stats(stats): print("\nTotal number of replaced kernel launches: {0:d}".format(len(stats["kernel_launches"]))) +def add_dim3(kernel_string, cuda_kernel): + '''adds dim3() to the second and third arguments in the kernel launch''' + count = 0 + closure = 0 + kernel_string = kernel_string.replace("<<<", "").replace(">>>", "") + arg_locs = [{} for _ in range(2)] + arg_locs[count]['start'] = 0 + for ind, c in enumerate(kernel_string): + if count > 1: + break + if c == "(": + closure += 1 + elif c == ")": + closure -= 1 + elif (c == "," or ind == len(kernel_string) - 1) and closure == 0: + arg_locs[count]['end'] = ind + count += 1 + if count < 2: + arg_locs[count]['start'] = ind + 1 + + first_arg_raw = kernel_string[arg_locs[0]['start']:arg_locs[0]['end'] + 1] + second_arg_raw = kernel_string[arg_locs[1]['start']:arg_locs[1]['end']] + + first_arg_clean = kernel_string[arg_locs[0]['start']:arg_locs[0]['end']].replace("\n", "").strip(" ") + second_arg_clean = kernel_string[arg_locs[1]['start']:arg_locs[1]['end']].replace("\n", "").strip(" ") + + first_arg_dim3 = "dim3({})".format(first_arg_clean) + second_arg_dim3 = "dim3({})".format(second_arg_clean) + + first_arg_raw_dim3 = first_arg_raw.replace(first_arg_clean, first_arg_dim3) + second_arg_raw_dim3 = second_arg_raw.replace(second_arg_clean, second_arg_dim3) + cuda_kernel = cuda_kernel.replace(first_arg_raw + second_arg_raw, first_arg_raw_dim3 + second_arg_raw_dim3) + return cuda_kernel + + def processKernelLaunches(string, stats): """ Replace the CUDA style Kernel launches with the HIP style kernel launches.""" # Concat the namespace with the kernel names. (Find cleaner way of doing this later). @@ -396,12 +409,12 @@ def find_kernel_bounds(string): # Extract cuda kernel cuda_kernel = string[params[0]["start"]:parenthesis + 1] - + kernel_string = string[kernel['start']:kernel['end']] + cuda_kernel_dim3 = add_dim3(kernel_string, cuda_kernel) # Keep number of kernel launch params consistent (grid dims, group dims, stream, dynamic shared size) num_klp = len(extract_arguments(0, kernel["group"].replace("<<<", "(").replace(">>>", ")"))) - # Transform cuda kernel to hip kernel - hip_kernel = "hipLaunchKernelGGL(" + cuda_kernel[0:-1].replace( + hip_kernel = "hipLaunchKernelGGL(" + cuda_kernel_dim3[0:-1].replace( ">>>", ", 0" * (4 - num_klp) + ">>>").replace("<<<", ", ").replace(">>>", ", ") # Replace cuda kernel with hip kernel @@ -450,6 +463,7 @@ def disable_asserts(input_string): output_string = output_string.replace(input_string[start:p_end + 1], "") return output_string + def replace_forceinline(input_string): """__forceinline__'d methods can cause 'symbol multiply defined' errors in HIP. 
Adding 'static' to all such methods leads to compilation errors, so @@ -460,6 +474,7 @@ def replace_forceinline(input_string): output_string = re.sub("__forceinline__", "inline", output_string) return output_string + def replace_math_functions(input_string): """ FIXME: Temporarily replace std:: invocations of math functions with non-std:: versions to prevent linker errors NOTE: This can lead to correctness issues when running tests, since the correct version of the math function (exp/expf) might not get called. @@ -471,6 +486,7 @@ def replace_math_functions(input_string): output_string = re.sub("std::pow\(", "::pow(", output_string) return output_string + def disable_function(input_string, function, replace_style): """ Finds and disables a function in a particular file. @@ -610,11 +626,42 @@ def disable_function(input_string, function, replace_style): return output_string -def preprocessor(filepath, stats): +def get_hip_file_path(filepath, hipify_caffe2): + """ Returns the new name of the hipified file """ + if not hipify_caffe2: + return filepath + + dirpath, filename = os.path.split(filepath) + filename_without_ext, ext = os.path.splitext(filename) + + if 'gpu' in filename_without_ext: + filename_without_ext = filename_without_ext.replace('gpu', 'hip') + else: + filename_without_ext += '_hip' + + if ext == '.cu': + ext = '.cc' + + return os.path.join(dirpath, 'hip', filename_without_ext + ext) + + +def is_caffe2_gpu_file(filepath): + filename = os.path.basename(filepath) + _, ext = os.path.splitext(filename) + return 'gpu' in filename or ext in ['.cu', '.cuh'] + + +def preprocessor(filepath, stats, hipify_caffe2): """ Executes the CUDA -> HIP conversion on the specified file. """ - with openf(filepath, "r+") as fileobj: - output_source = fileobj.read() + fin_path = filepath + with open(fin_path, 'r') as fin: + output_source = fin.read() + fout_path = get_hip_file_path(filepath, hipify_caffe2) + if not os.path.exists(os.path.dirname(fout_path)): + os.makedirs(os.path.dirname(fout_path)) + + with open(fout_path, 'w') as fout: # Perform type, method, constant replacements for mapping in CUDA_TO_HIP_MAPPINGS: for cuda_type, value in mapping.items(): @@ -622,13 +669,22 @@ def preprocessor(filepath, stats): hip_type = value[0] meta_data = value[1:] + if constants.API_CAFFE2 in meta_data and not hipify_caffe2: + continue + if constants.API_RAND in meta_data and hipify_caffe2: + continue + if output_source.find(cuda_type) > -1: # Check if supported if constants.HIP_UNSUPPORTED in meta_data: stats["unsupported_calls"].append((cuda_type, filepath)) if cuda_type in output_source: - output_source = re.sub(r'\b({0})\b'.format(cuda_type), lambda x: hip_type, output_source) + if hipify_caffe2: + pattern = r'({0})'.format(cuda_type) + else: + pattern = r'(\b{0}\b)'.format(cuda_type) + output_source = re.sub(pattern, hip_type, output_source) # Perform Kernel Launch Replacements output_source = processKernelLaunches(output_source, stats) @@ -643,14 +699,7 @@ def preprocessor(filepath, stats): # Replace __forceinline__ with inline output_source = replace_forceinline(output_source) - # Overwrite file contents - fileobj.seek(0) - fileobj.write(output_source) - fileobj.truncate() - fileobj.flush() - - # Flush to disk - os.fsync(fileobj) + fout.write(output_source) def file_specific_replacement(filepath, search_string, replace_string, strict=False): @@ -847,7 +896,7 @@ def extract_arguments(start, string): closures["("] -= 1 elif string[current_position] == "<": closures["<"] += 1 - elif 
string[current_position] == ">" and string[current_position - 1] != "-": + elif string[current_position] == ">" and string[current_position - 1] != "-" and closures["<"] > 0: closures["<"] -= 1 # Finished all arguments @@ -867,7 +916,7 @@ def extract_arguments(start, string): # Add static_cast to ensure that the type of kernel arguments matches that in the corresponding kernel definition -def add_static_casts(directory, extensions, KernelTemplateParams): +def add_static_casts(filepath, KernelTemplateParams): """Add static casts to kernel launches in order to keep launch argument types and kernel definition types matching. Example: @@ -884,73 +933,70 @@ def add_static_casts(directory, extensions, KernelTemplateParams): static_cast_types = ["int", "const int", "int64_t", "THCIndex_t *", "const int *", "ptrdiff_t", "long", "const int64_t*", "int64_t *", "double"] - # Add static_casts<> to all kernel launches. - for (dirpath, _dirnames, filenames) in os.walk(directory): - for filename in filenames: - if filename_ends_with_extension(filename, extensions): - filepath = os.sep.join([dirpath, filename]) - with openf(filepath, "r+") as fileobj: - input_source = fileobj.read() - new_output_source = input_source - for kernel in re.finditer("hipLaunchKernelGGL\(", input_source): - arguments = extract_arguments(kernel.end() - 1, input_source) - - # Check if we have templating + static_cast information - argument_strings = [input_source[arg["start"]:arg["end"]] for arg in arguments] - original_kernel_name_with_template = argument_strings[0].strip() - kernel_name = original_kernel_name_with_template.split("<")[0].strip() - ignore = ["upscale"] - if kernel_name in KernelTemplateParams and kernel_name not in ignore: - # Add template to the kernel - # Add static_casts to relevant arguments - kernel_name_with_template = KernelTemplateParams[kernel_name]["kernel_with_template"] - argument_types = KernelTemplateParams[kernel_name]["arg_types"] - - # The first 5 arguments are simply (function, number blocks, dimension blocks, shared memory, stream) - # old_kernel_launch_parameters - will contain the actual arguments to the function itself. - old_kernel_launch_parameters = input_source[arguments[5]["start"]:arguments[-1]["end"]] - new_kernel_launch_parameters = old_kernel_launch_parameters - - # full_old_kernel_launch - will contain the entire kernel launch closure. - full_old_kernel_launch = input_source[arguments[0]["start"]:arguments[-1]["end"]] - full_new_kernel_launch = full_old_kernel_launch - - kernel_params = argument_strings[5:] - for arg_idx, arg in enumerate(kernel_params): - if arg_idx in argument_types: - the_type = argument_types[arg_idx] - the_arg = arg.replace("\n", "").replace("\\", "").strip() - # Not all types have issues with the hipLaunchKernelGGL. - if the_type in static_cast_types: - static_argument = "static_cast<{0}>({1})".format(the_type, the_arg) - - def replace_arg(match): - return match.group(1) + static_argument + match.group(3) - # Update to static_cast, account for cases where argument is at start/end of string - new_kernel_launch_parameters = re.sub(r'(^|\W)({0})(\W|$)'.format( - re.escape(the_arg)), replace_arg, new_kernel_launch_parameters) - - # replace kernel arguments in full kernel launch arguments w/ static_cast ones - full_new_kernel_launch = full_new_kernel_launch.replace(old_kernel_launch_parameters, new_kernel_launch_parameters) - - # PyTorch Specific: Add template type - # Here the template value will be resolved from to . 
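For context on the one-character change to extract_arguments a little above: the argument splitter tracks '(' and '<' nesting so that commas inside template arguments do not split an argument, and it must ignore '>' both in '->' and when no '<' is open (e.g. a plain comparison), otherwise the nesting count goes negative and later commas are swallowed. A small self-contained sketch of that splitting rule (not the script's own function; the input string is made up):

    def split_top_level_args(s):
        # split a comma-separated argument list, respecting () and <> nesting
        depth = {"(": 0, "<": 0}
        args, start = [], 0
        for i, c in enumerate(s):
            if c == "(":
                depth["("] += 1
            elif c == ")":
                depth["("] -= 1
            elif c == "<":
                depth["<"] += 1
            elif c == ">" and s[i - 1] != "-" and depth["<"] > 0:
                depth["<"] -= 1
            elif c == "," and depth["("] == 0 and depth["<"] == 0:
                args.append(s[start:i].strip())
                start = i + 1
        args.append(s[start:].strip())
        return args

    print(split_top_level_args("x->data, static_cast<int>(n), a > b, stream"))
    # ['x->data', 'static_cast<int>(n)', 'a > b', 'stream']
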
- if "THCUNN" in filepath.split("/") and "generic" not in filepath.split("/"): - kernel_name_with_template = kernel_name_with_template.replace("", "") - full_new_kernel_launch = re.sub(r'\b{0}\b'.format(original_kernel_name_with_template), - lambda x: kernel_name_with_template, full_new_kernel_launch) - - # Replace Launch - new_output_source = new_output_source.replace(full_old_kernel_launch, full_new_kernel_launch) - - # Overwrite file contents - fileobj.seek(0) - fileobj.write(new_output_source) - fileobj.truncate() - fileobj.flush() - - # Flush to disk - os.fsync(fileobj) + with openf(filepath, "r+") as fileobj: + input_source = fileobj.read() + new_output_source = input_source + for kernel in re.finditer("hipLaunchKernelGGL\(", input_source): + arguments = extract_arguments(kernel.end() - 1, input_source) + + # Check if we have templating + static_cast information + argument_strings = [input_source[arg["start"]:arg["end"]] for arg in arguments] + original_kernel_name_with_template = argument_strings[0].strip() + kernel_name = original_kernel_name_with_template.split("<")[0].strip() + ignore = ["upscale"] + if kernel_name in KernelTemplateParams and kernel_name not in ignore: + # Add template to the kernel + # Add static_casts to relevant arguments + kernel_name_with_template = KernelTemplateParams[kernel_name]["kernel_with_template"] + argument_types = KernelTemplateParams[kernel_name]["arg_types"] + + # The first 5 arguments are simply (function, number blocks, dimension blocks, shared memory, stream) + # old_kernel_launch_parameters - will contain the actual arguments to the function itself. + old_kernel_launch_parameters = input_source[arguments[5]["start"]:arguments[-1]["end"]] + new_kernel_launch_parameters = old_kernel_launch_parameters + + # full_old_kernel_launch - will contain the entire kernel launch closure. + full_old_kernel_launch = input_source[arguments[0]["start"]:arguments[-1]["end"]] + full_new_kernel_launch = full_old_kernel_launch + + kernel_params = argument_strings[5:] + for arg_idx, arg in enumerate(kernel_params): + if arg_idx in argument_types: + the_type = argument_types[arg_idx] + the_arg = arg.replace("\n", "").replace("\\", "").strip() + # Not all types have issues with the hipLaunchKernelGGL. + if the_type in static_cast_types: + static_argument = "static_cast<{0}>({1})".format(the_type, the_arg) + + def replace_arg(match): + return match.group(1) + static_argument + match.group(3) + # Update to static_cast, account for cases where argument is at start/end of string + new_kernel_launch_parameters = re.sub(r'(^|\W)({0})(\W|$)'.format( + re.escape(the_arg)), replace_arg, new_kernel_launch_parameters) + + # replace kernel arguments in full kernel launch arguments w/ static_cast ones + full_new_kernel_launch = full_new_kernel_launch.replace( + old_kernel_launch_parameters, new_kernel_launch_parameters) + + # PyTorch Specific: Add template type + # Here the template value will be resolved from to . 
+ if "THCUNN" in filepath.split("/") and "generic" not in filepath.split("/"): + kernel_name_with_template = kernel_name_with_template.replace("", "") + + full_new_kernel_launch = re.sub(r'\b{0}\b'.format(original_kernel_name_with_template), + lambda x: kernel_name_with_template, full_new_kernel_launch) + + # Replace Launch + new_output_source = new_output_source.replace(full_old_kernel_launch, full_new_kernel_launch) + + # Overwrite file contents + fileobj.seek(0) + fileobj.write(new_output_source) + fileobj.truncate() + fileobj.flush() + + # Flush to disk + os.fsync(fileobj) def str2bool(v): @@ -990,7 +1036,7 @@ def main(): parser.add_argument( '--extensions', nargs='+', - default=["cu", "cuh", "c", "cpp", "h", "in", "hpp"], + default=[".cu", ".cuh", ".c", ".cpp", ".h", ".in", ".hpp"], help="The extensions for files to run the Hipify script over.", required=False) @@ -1002,10 +1048,10 @@ def main(): required=False) parser.add_argument( - '--include-dirs', + '--includes', nargs='+', default=[], - help="The directories under the root that should be included.", + help="The patterns of files that should be included.", required=False) parser.add_argument( @@ -1022,6 +1068,19 @@ def main(): help="Whether to automatically add static_casts to kernel arguments.", required=False) + parser.add_argument( + '--hipify_caffe2', + type=str2bool, + default=False, + help="Whether to hipify caffe2 source", + required=False) + + parser.add_argument( + '--ignores', + nargs='+', + default=[], + help="list of patterns to ignore for hipifying") + parser.add_argument( '--show-progress', type=str2bool, @@ -1037,33 +1096,14 @@ def main(): sys.exit(1) # If no output directory, provide a default one. - if args.output_directory is "": + if not args.output_directory: args.project_directory.rstrip("/") args.output_directory = args.project_directory + "_amd" - # Make sure output directory does not exist. - if not os.path.exists(args.output_directory): - print("The output folder already exists.") - sys.exit(2) - # Copy from project directory to output directory if not done already. if not os.path.exists(args.output_directory): shutil.copytree(args.project_directory, args.output_directory) - # Extract all of the kernel parameter and template type information. - if args.add_static_casts: - KernelTemplateParams = {} - for (dirpath, _dirnames, filenames) in os.walk(args.output_directory): - for filename in filenames: - if filename_ends_with_extension(filename, args.extensions) and inside_included_directories(dirpath, args.output_directory, args.include_dirs): - the_file = os.sep.join([dirpath, filename]) - - # Store param information inside KernelTemplateParams - get_kernel_template_params( - the_file, - KernelTemplateParams, - PYTORCH_TEMPLATE_MAP) - # Open YAML file with disable information. if args.yaml_settings != "": with openf(args.yaml_settings, "r") as f: @@ -1152,17 +1192,28 @@ def main(): f.write(txt) f.truncate() + all_files = list(matched_files_iter(args.output_directory, includes=args.includes, + ignores=args.ignores, extensions=args.extensions, hipify_caffe2=args.hipify_caffe2)) + # Start Preprocessor - walk_over_directory( - args.output_directory, - extensions=args.extensions, + preprocess( + all_files, show_detailed=args.show_detailed, - include_dirs=args.include_dirs, - show_progress=args.show_progress) + show_progress=args.show_progress, + hipify_caffe2=args.hipify_caffe2) + # Extract all of the kernel parameter and template type information. 
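One thing worth keeping in mind for the driver changes around here: in caffe2 mode the hipified source is written next to the original under a hip/ subdirectory (get_hip_file_path above), which is why the static-cast pass below is pointed at get_hip_file_path(filepath, ...) rather than at the original .cu file. A quick sketch mirroring the naming rule added earlier in this file (the example paths are illustrative):

    import os

    def hip_path(filepath):
        dirpath, filename = os.path.split(filepath)
        stem, ext = os.path.splitext(filename)
        # 'gpu' in the stem becomes 'hip'; otherwise a '_hip' suffix is appended
        stem = stem.replace('gpu', 'hip') if 'gpu' in stem else stem + '_hip'
        if ext == '.cu':
            ext = '.cc'
        return os.path.join(dirpath, 'hip', stem + ext)

    print(hip_path("caffe2/operators/relu_op.cu"))   # caffe2/operators/hip/relu_op_hip.cc
    print(hip_path("caffe2/core/context_gpu.h"))     # caffe2/core/hip/context_hip.h
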
if args.add_static_casts: + KernelTemplateParams = {} + for filepath in all_files: + get_kernel_template_params( + filepath, + KernelTemplateParams, + CAFFE2_TEMPLATE_MAP if args.hipify_caffe2 else PYTORCH_TEMPLATE_MAP) + # Execute the Clang Tool to Automatically add static casts - add_static_casts(args.output_directory, args.extensions, KernelTemplateParams) + for filepath in all_files: + add_static_casts(get_hip_file_path(filepath, hipify_caffe2=args.hipify_caffe2), KernelTemplateParams) if __name__ == '__main__': diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 960e55727031b3..3a396d84b66e4f 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -342,7 +342,7 @@ self: at::zeros(self.sizes(), grad.type()).index_add_(dim, index, grad) - name: inverse(Tensor self) - self: -at::mm(output.t(), at::mm(grad, output.t())) + self: -at::mm(result.t(), at::mm(grad, result.t())) - name: kthvalue(Tensor self, int64_t k, int64_t dim, bool keepdim) self: index_select_backward(grad, dim, result1, self.sizes(), keepdim) @@ -579,7 +579,7 @@ self: repeat_backward(grad, self.dim(), repeats) # DO NOT define a backward for reshape! -# reshape is special in that it sometimes returns a view, and somtimes not. +# reshape is special in that it sometimes returns a view, and sometimes not. # Defining a backward will make codegen spit out the forward call as # as_variable(baseType->reshape(self)), # making it impossible (hard) to detect when it is actually a view. diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index f9bd8d9c0d2b28..6d85270fffc89a 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -25,11 +25,12 @@ 'index', '_indexCopy_', 'max_values', 'min_values', 'argmax', 'argmin', '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', - 'arange.*', 'range.*', '_gesv.*', 'slice', 'max_pool1d', 'max_pool2d', 'max_pool3d' + 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', + 'max_pool1d', 'max_pool2d', 'max_pool3d' ] PY_VARIABLE_METHOD_VARARGS = CodeTemplate("""\ -static PyObject * ${pycname}(PyObject* self, PyObject* args, PyObject* kwargs) +static PyObject * ${pycname}(PyObject* self_, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS static PythonArgParser parser({ @@ -45,7 +46,7 @@ """) PY_VARIABLE_METHOD_NOARGS = CodeTemplate("""\ -static PyObject * ${pycname}(PyObject* self, PyObject* args) +static PyObject * ${pycname}(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS ${unpack_self} @@ -98,7 +99,7 @@ PY_VARIABLE_METHOD_DEF = CodeTemplate("""\ {"${name}", (PyCFunction)${pycname}, ${flags}, NULL},""") -UNPACK_SELF = "auto& self_ = reinterpret_cast(self)->cdata;" +UNPACK_SELF = "auto& self = reinterpret_cast(self_)->cdata;" PYTHON_FUNCTION_SIGNATURE = CodeTemplate("""\ ${name}(${py_formal_args})""") @@ -329,7 +330,7 @@ def append_actuals_formals(actual, formal): continue if has_self and arg['name'] == 'self': formal_args.append('Tensor & self') - actuals.append('self_') + actuals.append('self') continue append_actuals_formals(*parse_arg(arg, arg_idx, unpack)) arg_idx += 1 @@ -582,7 +583,7 @@ def process_function(name, declarations): if len(declarations) == 1 and len(declarations[0]['args']) == 1 and has_self: tmpl = PY_VARIABLE_METHOD_NOARGS - env['actuals'] = ['self_'] + env['actuals'] = ['self'] env['flags'] = 'METH_NOARGS' else: tmpl = PY_VARIABLE_METHOD_VARARGS diff --git a/tools/autograd/gen_variable_type.py 
b/tools/autograd/gen_variable_type.py index 679e3c97f9fa54..97130b1bdbb3bc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -54,6 +54,13 @@ 's_native_mul': 'mul', 'th_addmm': 'addmm', 's_native_addmm': 'addmm', + 'zero': 'zeros_like', + 'fill': 'full_like', +} + +# (declaration name, argument name) -> attribute name +RENAME_ATTRIBUTES = { + ('fill_', 'value'): 'fill_value' } # These functions are not worth profiling because they are very cheap and may @@ -126,7 +133,7 @@ PRE_RECORD_TRACE = CodeTemplate("""\ jit::tracer::PreTraceInfo trace_info; -if (jit::tracer::isTracing( ${tensor_args} )) { +if (jit::tracer::isTracing()) { trace_info = jit::tracer::preRecordTrace( jit::aten::${trace_name}, ${trace_inputs} ); if (!jit::tracer::ArgumentStash::empty()) { ${record_positional_attributes} @@ -138,13 +145,13 @@ """) POST_RECORD_TRACE = CodeTemplate("""\ -if (trace_info.state != nullptr) { +if (jit::tracer::isTracing()) { jit::tracer::postRecordTrace( trace_info, ${trace_outputs} ); } """) RECORD_ATTRIBUTE = CodeTemplate("""\ -setattr(trace_info.n, jit::attr::${name}, ${name});""") +setattr(trace_info.n, jit::attr::${attr_name}, ${name});""") RECORD_POSITIONAL_ATTRIBUTE = CodeTemplate("""\ setposattr(trace_info.n, ${i}, "${name}", ${name});""") @@ -417,7 +424,8 @@ def emit_record_trace(env): for arg in declaration['arguments']: if arg['simple_type'] in {'Tensor', 'TensorList'}: continue - local['record_attributes'].append(RECORD_ATTRIBUTE.substitute(name=arg['name'])) + attr_name = RENAME_ATTRIBUTES.get((declaration['name'], arg['name']), arg['name']) + local['record_attributes'].append(RECORD_ATTRIBUTE.substitute(attr_name=attr_name, name=arg['name'])) local['record_positional_attributes'] = [] for i, arg in enumerate(declaration['arguments']): diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 0695c0d89befe6..03e0f641696144 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -14,6 +14,7 @@ #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/variadic.h" +#include "torch/csrc/autograd/functions/utils.h" #include #include @@ -38,10 +39,14 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { // Helper methods for working with Attributes (torch/csrc/jit/attributes.h) +at::Tensor maybeUnwrapVar(const at::Tensor& t) { + return t.is_variable() ? 
Variable(t).data() : t; +} + // The overloaded accessors are convenient for the generated code (since we // don't want to make the codegen do the dispatch manually) static void setattr(jit::Node* n, jit::Symbol name, int64_t v) { n->i_(name, v); } -static void setattr(jit::Node* n, jit::Symbol name, const at::Scalar& v) { n->t_(name, v.toTensor()); } +static void setattr(jit::Node* n, jit::Symbol name, const at::Scalar& v) { n->t_(name, maybeUnwrapVar(v.toTensor())); } static void setattr(jit::Node* n, jit::Symbol name, SparseTensorRef s) { n->t_(name, s.tref); } static void setattr(jit::Node* n, jit::Symbol name, const at::IntList& v) { n->is_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, bool v) { n->i_(name, v); } @@ -327,26 +332,6 @@ static std::vector as_view(const Tensor & base, std::vector tens return tensors; } -struct ComputeRequiresGrad : IterArgs { - bool out = false; - using IterArgs::operator(); - void operator()(const at::Tensor& tensor) { - const auto& var = static_cast(tensor); - if (var.defined() && var.requires_grad()) { - out = true; - } - } - bool short_circuit() { return out; } -}; - -template -static bool compute_requires_grad(Args&&... args) { - if (!GradMode::is_enabled()) { - return false; - } - return ComputeRequiresGrad().apply(std::forward(args)...).out; -} - static void check_no_requires_grad(const Tensor& tensor, const char* name) { auto& var = static_cast(tensor); if (var.defined() && var.requires_grad()) { @@ -394,20 +379,6 @@ static void rebase_history(ArrayRef vars, std::shared_ptr gr } } -static void set_history(ArrayRef vars, std::shared_ptr grad_fn) { - if (grad_fn) { - for (auto& var : vars) { - if (var.defined()) { - // TODO: eliminate const_cast - auto output_nr = grad_fn->add_input_metadata(var.type(), var.sizes()); - const_cast(var).set_gradient_edge({grad_fn, output_nr}); - } else { - grad_fn->add_input_metadata(Function::undefined_input()); - } - } - } -} - struct Flatten : IterArgs { Flatten(variable_list& out) : out(out) {} variable_list& out; diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 567bbdf5f60b27..5bf7a4e5591155 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -4,6 +4,8 @@ #include +#include + #include // for size_t #include // for function #include // for unique_ptr @@ -28,7 +30,7 @@ using at::optional; void register_variable_type_for(at::Type* baseType); -struct VariableType final : public at::Type { +struct TORCH_API VariableType final : public at::Type { VariableType(Context* context, at::Type* baseType); virtual at::ScalarType scalarType() const override; virtual at::Backend backend() const override; diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 8e8c87607b478b..9e4405482b4485 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -140,7 +140,7 @@ static PyObject * THPVariable_size(PyObject* self, PyObject* args, PyObject* kwa ParsedArgs<3> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - if (jit::tracer::isTracing(self_)) { + if (jit::tracer::isTracing()) { return wrap(jit::tracer::getSizeOf(self_, r.toInt64(0))); } else { return wrap(self_.size(r.toInt64(0))); diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 0bfe87b3c9a8a1..cc8271515590ba 100755 --- a/tools/build_pytorch_libs.bat +++ 
b/tools/build_pytorch_libs.bat @@ -175,6 +175,9 @@ goto:eof cmake .. %CMAKE_GENERATOR_COMMAND% ^ -DCMAKE_BUILD_TYPE=%BUILD_TYPE% ^ -DBUILD_CAFFE2=OFF ^ + -DBUILD_TORCH="%BUILD_TORCH%" ^ + -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ + -DNO_API=ON ^ -DBUILD_ATEN=ON ^ -DBUILD_PYTHON=OFF ^ -DBUILD_BINARY=OFF ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index e3d6202c69ec8f..4a0dbd04c905f1 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -255,6 +255,7 @@ function build_caffe2() { -DBUILDING_WITH_TORCH_LIBS=ON \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_CAFFE2=$FULL_CAFFE2 \ + -DBUILD_TORCH=$BUILD_TORCH \ -DBUILD_ATEN=ON \ -DBUILD_PYTHON=$FULL_CAFFE2 \ -DBUILD_BINARY=OFF \ diff --git a/tools/cpp_build/build_all.sh b/tools/cpp_build/build_all.sh deleted file mode 100755 index abc9cd5c5d7bed..00000000000000 --- a/tools/cpp_build/build_all.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -ex -SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -source $SCRIPTPATH/build_caffe2.sh -source $SCRIPTPATH/build_libtorch.sh diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index 4c3254e7de66b6..b35435acb388c6 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -12,7 +12,9 @@ echo "Building Caffe2" mkdir -p $CAFFE2_BUILDPATH pushd $CAFFE2_BUILDPATH -cmake -DUSE_CUDA=$USE_CUDA \ +cmake -DUSE_CUDA:BOOL=$USE_CUDA \ + -DBUILD_TORCH=ON \ + -DUSE_OPENMP:BOOL=${USE_OPENMP:ON} \ -DBUILD_CAFFE2=OFF \ -DBUILD_ATEN=ON \ -DBUILD_PYTHON=OFF \ diff --git a/tools/cpp_build/build_common.sh b/tools/cpp_build/build_common.sh index 6a801937a936d9..be9ac2b271743d 100755 --- a/tools/cpp_build/build_common.sh +++ b/tools/cpp_build/build_common.sh @@ -11,7 +11,6 @@ fi CAFFE2_BUILDPATH="$BUILD_PATH/caffe2" NANOPB_BUILDPATH="$BUILD_PATH/nanopb" -LIBTORCH_BUILDPATH="$BUILD_PATH/libtorch" # Build with Ninja if available. It has much cleaner output. 
GENERATE="Unix Makefiles" diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index b5001120b8ca91..92a9b9981ed697 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -13,7 +13,7 @@ mkdir -p $LIBTORCH_BUILDPATH pushd $LIBTORCH_BUILDPATH cmake -DUSE_CUDA:BOOL=$USE_CUDA \ - -DNO_API:BOOL=${NO_API:0} \ + -DNO_API:BOOL=${NO_API:-0} \ -DCAFFE2_PATH=$PYTORCHPATH/ \ -DCAFFE2_BUILD_PATH=$CAFFE2_BUILDPATH \ -DONNX_NAMESPACE=$ONNX_NAMESPACE \ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index d73406aa063d5f..02fd0428622c13 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -1,91 +1,20 @@ -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) +if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + if (NOT BUILD_TORCH) + return() + endif() +else() cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - include(CMakeDependentOption) + project(torch CXX C) + find_package(Caffe2 REQUIRED) option(USE_CUDA "Use CUDA" ON) - option(TORCH_BUILD_TEST "Build torch test binaries" ON) - - # Flag for shared dependencies - set(BUILD_TORCH ON) endif() -cmake_policy(VERSION 3.0) - -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -if (VERBOSE) - message(STATUS "CAFFE2_PATH is ${CAFFE2_PATH}") - message(STATUS "CAFFE2_BUILD_PATH is ${CAFFE2_BUILD_PATH}") - message(STATUS "INSTALL_PREFIX is ${INSTALL_PREFIX}") -endif() +option(BUILD_TORCH_TEST "Build torch test binaries" ON) -set(CAFFE2_INCLUDE_DIR "${CAFFE2_PATH}") -set(CAFFE2_BUILD_LIB_DIR "${CAFFE2_BUILD_PATH}/lib") -set(CAFFE2_INSTALL_INCLUDE_DIR "${INSTALL_PREFIX}/include") -set(CAFFE2_INSTALL_SHARE_DIR "${INSTALL_PREFIX}/share") -set(CAFFE2_INSTALL_LIB_DIR "${INSTALL_PREFIX}/lib") set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -find_library(CAFFE2_LIBRARY caffe2 - NAMES libcaffe2.so libcaffe2.dylib caffe2.lib - PATHS ${CAFFE2_INSTALL_LIB_DIR} NO_DEFAULT_PATH) -find_library(CAFFE2_GPU_LIBRARY caffe2_gpu - NAMES libcaffe2_gpu.so libcaffe2_gpu.dylib caffe2_gpu.lib - PATHS ${CAFFE2_INSTALL_LIB_DIR} NO_DEFAULT_PATH) -find_library(PROTOBUF_LIBRARY protobuf - NAMES libprotobuf.a libprotobufd.a libprotobuf.lib libprotobufd.lib - PATHS ${CAFFE2_BUILD_LIB_DIR} NO_DEFAULT_PATH) - add_subdirectory(../third_party/nanopb protobuf-nanopb) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -if(USE_CUDA) - set(CMAKE_MODULE_PATH - ${INSTALL_PREFIX}/share/cmake - ${TORCH_SRC_DIR}/../cmake/Modules - ${TORCH_SRC_DIR}/../cmake/public - ${TORCH_SRC_DIR}/../cmake/Modules_CUDA_fix - /usr/lib/x86_64-linux-gnu/ - ${CMAKE_MODULE_PATH}) - set(CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/ ${CMAKE_LIBRARY_PATH}) - - if(NOT CUDA_FOUND) - find_package(CUDA 7.0) - endif() - - find_package(MAGMA) - if(CUDA_FOUND AND MAGMA_FOUND) - include_directories("${MAGMA_INCLUDE_DIR}") - set(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}") - include(CheckPrototypeDefinition) - check_prototype_definition(magma_get_sgeqrf_nb - "magma_int_t magma_get_sgeqrf_nb( magma_int_t m, magma_int_t n );" - "0" - "magma.h" - MAGMA_V2) - IF (MAGMA_V2) - add_definitions(-DMAGMA_V2) - endif (MAGMA_V2) - - set(USE_MAGMA 1) - if(VERBOSE) - message(STATUS "Compiling with MAGMA support") - message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") - message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") - 
message(STATUS "MAGMA V2 check: ${MAGMA_V2}") - endif() - else() - message(STATUS "MAGMA not found. Compiling without MAGMA support") - endif() -endif() - -add_definitions(-DUSE_CATCH -D_FORCE_INLINES -DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT TORCH_INSTALL_BIN_DIR) set(TORCH_INSTALL_BIN_DIR bin) endif() @@ -98,19 +27,6 @@ if(NOT TORCH_INSTALL_LIB_DIR) set(TORCH_INSTALL_LIB_DIR lib) endif() -if(USE_CUDA) - add_definitions(-DUSE_CUDA) - - set(TORCH_CUDA_LIBRARIES - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda.so - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvrtc.so - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so - ${CUDA_LIBRARIES}) - - list(APPEND CUDA_INCLUDE_DIRS - ${CAFFE2_INSTALL_INCLUDE_DIR}/THC) -endif() - # RPATH stuff # see https://cmake.org/Wiki/CMake_RPATH_handling if(APPLE) @@ -134,11 +50,11 @@ endif() # Generate files set(TOOLS_PATH "${TORCH_SRC_DIR}/../tools") -configure_file("${CAFFE2_PATH}/aten/src/ATen/common_with_cwrap.py" +configure_file("${TORCH_SRC_DIR}/../aten/src/ATen/common_with_cwrap.py" "${TOOLS_PATH}/shared/cwrap_common.py" COPYONLY) -configure_file("${CAFFE2_PATH}/torch/_utils_internal.py" +configure_file("${TORCH_SRC_DIR}/_utils_internal.py" "${TOOLS_PATH}/shared/_utils_internal.py" COPYONLY) @@ -164,11 +80,11 @@ add_custom_command( "${TORCH_SRC_DIR}/csrc/jit/generated/aten_interned_strings.h" COMMAND python tools/setup_helpers/generate_code.py - --declarations-path "${CAFFE2_INSTALL_SHARE_DIR}/ATen/Declarations.yaml" + --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" --nn-path "aten/src/" DEPENDS - "${CAFFE2_INSTALL_SHARE_DIR}/ATen/Declarations.yaml" - "${CAFFE2_INSTALL_INCLUDE_DIR}/THNN/generic/THNN.h" + "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + "${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h" "${TOOLS_PATH}/autograd/templates/VariableType.h" "${TOOLS_PATH}/autograd/templates/VariableType.cpp" "${TOOLS_PATH}/autograd/templates/Functions.h" @@ -191,62 +107,68 @@ add_custom_command( WORKING_DIRECTORY "${TORCH_SRC_DIR}/..") set(TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/aten_variable_hooks.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp - ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp + ${TORCH_SRC_DIR}/csrc/assertions.cpp ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/aten_variable_hooks.cpp + ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp ${TORCH_SRC_DIR}/csrc/autograd/function.cpp - ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp + ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp + ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.cpp + ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp + ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp + ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp - ${TORCH_SRC_DIR}/csrc/assertions.cpp - ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp + ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp + 
${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp + ${TORCH_SRC_DIR}/csrc/jit/export.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusion_compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/operator.cpp - ${TORCH_SRC_DIR}/csrc/jit/variable_flags.cpp + ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp + ${TORCH_SRC_DIR}/csrc/jit/import.cpp + ${TORCH_SRC_DIR}/csrc/jit/interned_strings.cpp ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/ir.cpp - ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp - ${TORCH_SRC_DIR}/csrc/jit/fusion_compiler.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_addmm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/erase_number_types.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/lower_grad_of.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_addmm.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp - ${TORCH_SRC_DIR}/csrc/jit/interned_strings.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp + ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp - ${TORCH_SRC_DIR}/csrc/jit/tracer_state.cpp - ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp ${TORCH_SRC_DIR}/csrc/jit/type.cpp - ${TORCH_SRC_DIR}/csrc/jit/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/import.cpp ${TORCH_SRC_DIR}/csrc/onnx/onnx.cpp ${TORCH_SRC_DIR}/csrc/onnx/onnx.npb.cpp - ${TORCH_SRC_DIR}/csrc/torch.cpp) + ${TORCH_SRC_DIR}/csrc/torch.cpp + ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp + ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp + ) -if (NOT NO_API) +if (NOT NO_API AND NOT USE_ROCM) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/utils.cpp ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp @@ -266,73 +188,159 @@ if (NOT NO_API) ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp - ) + ) + endif() add_library(torch SHARED ${TORCH_SRCS}) -# https://gcc.gnu.org/onlinedocs/gcc-4.0.3/gcc/Warning-Options.html -target_compile_options(torch - PRIVATE - -Wall - -Wextra - -pedantic - -Wcast-align - -Wcast-qual - -Wctor-dtor-privacy - -Wdisabled-optimization - -Winit-self - 
-Wmissing-include-dirs - -Woverloaded-virtual - -Wsign-promo - -Wstrict-overflow=5 - -Wundef - -fdiagnostics-show-option - -Wno-unused-parameter - -Wno-missing-braces # This warning is buggy - -Wno-unknown-pragmas) - -if ($ENV{WERROR}) - target_compile_options(torch PRIVATE -Werror) +target_compile_definitions(torch PRIVATE _THP_CORE) + +# until they can be unified, keep these lists synced with setup.py +if(MSVC) + target_compile_options(torch PRIVATE + /MD + /Z7 + /EHa + /DNOMINMAX + /wd4267 + /wd4251 + /wd4522 + /wd4522 + /wd4838 + /wd4305 + /wd4244 + /wd4190 + /wd4101 + /wd4996 + /wd4275 + /bigobj + ) +else() + target_compile_options(torch PRIVATE + -std=c++11 + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + # Clang has an unfixed bug leading to spurious missing braces + # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 + -Wno-missing-braces + ) +endif() + +# see the source file for explanation +set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp + PROPERTIES COMPILE_FLAGS -O0 + ) + +if (MSVC) +elseif ($ENV{WERROR}) + target_compile_options(torch PRIVATE -Werror -Wno-strict-overflow) +endif() + +if (MSVC) + target_link_libraries(torch onnx onnx_library) endif() target_link_libraries(torch - ${TORCH_CUDA_LIBRARIES} - ${CAFFE2_LIBRARY} - ${PROTOBUF_LIBRARY} - protobuf-nanopb -) + caffe2_library + protobuf-nanopb) + +find_package(OpenMP) +if(OPENMP_FOUND) + if (VERBOSE) + message(STATUS "Compiling with OpenMP") + endif() + target_compile_options(torch INTERFACE -fopenmp) + target_link_libraries(torch -fopenmp) +endif() + if(USE_CUDA) - if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") - target_link_libraries(torch -Wl,--no-as-needed ${CAFFE2_GPU_LIBRARY} -Wl,--as-needed) + if(MSVC) + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + target_include_directories(torch PRIVATE "${NVTOOLEXT_HOME}/include") + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + set_target_properties(torch PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() - target_link_libraries(torch ${CAFFE2_GPU_LIBRARY}) + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so + ${CUDA_LIBRARIES}) endif() + + if(MSVC OR APPLE) + target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) + else() + # TODO: using the full TORCH_CUDA_LIBRARIES here causes some + # builds to fail in CI, as libcuda.so can no longer be found. It's + # not clear why this is the case, and the situation should be + # investigated/cleaned up. Note that the test+jit/test_api + # targets below do require linking against the full + # TORCH_CUDA_LIBRARIES, even on Linux + target_link_libraries(torch caffe2_gpu_library ${CUDA_LIBRARIES}) + endif() + target_compile_definitions(torch PRIVATE USE_CUDA) endif() -target_include_directories(torch - PUBLIC - "${CAFFE2_INCLUDE_DIR}" - "${CAFFE2_INSTALL_INCLUDE_DIR}" - "${CAFFE2_INSTALL_INCLUDE_DIR}/TH" - "${TORCH_SRC_DIR}/.." 
- "${TORCH_SRC_DIR}") +if(USE_ROCM) + target_link_libraries(torch caffe2_hip_library) + target_compile_definitions(torch PRIVATE + USE_ROCM + __HIP_PLATFORM_HCC__ + ) + target_include_directories(torch PRIVATE + /opt/rocm/include + /opt/rocm/hcc/include + /opt/rocm/hipblas/include + /opt/rocm/hcsparse/include + ) +endif() -if (NOT NO_API) - target_include_directories(torch PUBLIC - "${TORCH_SRC_DIR}/csrc/api/" - "${TORCH_SRC_DIR}/csrc/api/include") + +set(TH_CPU_INCLUDE + # dense + ${TORCH_SRC_DIR}/../aten/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/TH + ${TORCH_SRC_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_BINARY_DIR}/aten/src) +target_include_directories(torch PRIVATE ${TH_CPU_INCLUDE}) + +if(USE_CUDA OR USE_ROCM) + set(TH_CUDA_INCLUDE + # dense + ${TORCH_SRC_DIR}/../aten/src/THC + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/THC) + target_include_directories(torch PRIVATE ${TH_CUDA_INCLUDE}) endif() +set(ATen_CPU_INCLUDE + ${TORCH_SRC_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen + ${CMAKE_BINARY_DIR}/aten/src) +target_include_directories(torch PRIVATE ${ATen_CPU_INCLUDE}) + +target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc) + # SYSTEM headers are included with -isystem and thus do not trigger warnings. target_include_directories(torch SYSTEM PUBLIC "${TORCH_SRC_DIR}/../third_party/cereal/include" # For cereal/ "${TORCH_SRC_DIR}/../third_party/nanopb") -if(USE_CUDA) - target_include_directories(torch SYSTEM PUBLIC "${CUDA_INCLUDE_DIRS}") -endif() - set_target_properties(torch PROPERTIES VERSION 1 SOVERSION 1) if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") @@ -348,45 +356,75 @@ install(TARGETS torch LIBRARY DESTINATION "${TORCH_INSTALL_LIB_DIR}" ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") -if (TORCH_BUILD_TEST) - # JIT Tests. TODO: Put into test/cpp/jit folder - +# JIT Tests. 
TODO: Put into test/cpp/jit folder +if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) + target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) + target_include_directories(test_jit PUBLIC + "${TORCH_SRC_DIR}/../third_party/catch/single_include" + ${ATen_CPU_INCLUDE}) - target_link_libraries(test_jit torch) + if (USE_CUDA) + target_link_libraries(test_jit ${CUDA_LIBRARIES}) + endif() +endif() - target_include_directories(test_jit PUBLIC - "${TORCH_SRC_DIR}/../third_party/catch/single_include") - - # API Tests - - if (NOT NO_API) - set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") - - add_executable(test_api - ${TORCH_API_TEST_DIR}/any.cpp - ${TORCH_API_TEST_DIR}/modules.cpp - ${TORCH_API_TEST_DIR}/cursor.cpp - ${TORCH_API_TEST_DIR}/integration.cpp - ${TORCH_API_TEST_DIR}/main.cpp - ${TORCH_API_TEST_DIR}/misc.cpp - ${TORCH_API_TEST_DIR}/module.cpp - ${TORCH_API_TEST_DIR}/optim.cpp - ${TORCH_API_TEST_DIR}/sequential.cpp - ${TORCH_API_TEST_DIR}/rnn.cpp - ${TORCH_API_TEST_DIR}/serialization.cpp - ${TORCH_API_TEST_DIR}/static.cpp - ${TORCH_API_TEST_DIR}/tensor.cpp - ${TORCH_API_TEST_DIR}/tensor_cuda.cpp - # Temporary until ATen tests are built with Caffe2 - ${TORCH_API_TEST_DIR}/tensor_options.cpp - ${TORCH_API_TEST_DIR}/tensor_options_cuda.cpp - ) +if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) + target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) + + if (NOT MSVC) + target_compile_options(torch PRIVATE -Wno-maybe-uninitialized) + endif() - target_include_directories(test_api - PUBLIC - "${TORCH_SRC_DIR}/../third_party/catch/single_include") + if (APPLE) + target_compile_options(torch PRIVATE -Wno-unknown-warning-option) + endif() - target_link_libraries(test_api torch) + set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") + + add_executable(test_api + ${TORCH_API_TEST_DIR}/any.cpp + ${TORCH_API_TEST_DIR}/cursor.cpp + ${TORCH_API_TEST_DIR}/integration.cpp + ${TORCH_API_TEST_DIR}/main.cpp + ${TORCH_API_TEST_DIR}/misc.cpp + ${TORCH_API_TEST_DIR}/module.cpp + ${TORCH_API_TEST_DIR}/modules.cpp + ${TORCH_API_TEST_DIR}/optim.cpp + ${TORCH_API_TEST_DIR}/parallel.cpp + ${TORCH_API_TEST_DIR}/rnn.cpp + ${TORCH_API_TEST_DIR}/sequential.cpp + ${TORCH_API_TEST_DIR}/serialization.cpp + ${TORCH_API_TEST_DIR}/static.cpp + ${TORCH_API_TEST_DIR}/tensor_cuda.cpp + ${TORCH_API_TEST_DIR}/tensor.cpp + # Temporary until ATen tests are built with Caffe2 + ${TORCH_API_TEST_DIR}/tensor_options.cpp + ${TORCH_API_TEST_DIR}/tensor_options_cuda.cpp + ) + + target_include_directories(test_api + PUBLIC + "${TORCH_SRC_DIR}/../third_party/catch/single_include" + ${ATen_CPU_INCLUDE}) + + target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES}) + + if (APPLE) + target_compile_options(test_api PRIVATE + -Wno-unknown-warning-option + -Wno-missing-braces + -Wno-maybe-uninitialized + -Wno-reorder + ) + elseif (MSVC) + else() + target_compile_options(test_api PRIVATE + -Wno-unused-but-set-parameter + -Wno-reorder + ) endif() endif() diff --git a/torch/_six.py b/torch/_six.py index 1cea0661f56459..1d70df51830d5e 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -25,6 +25,13 @@ PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] == 3 +if PY2: + inf = float('inf') + nan = float('nan') +else: + import math + inf = math.inf + nan = math.nan if PY2: string_classes = basestring diff --git a/torch/_tensor_docs.py 
b/torch/_tensor_docs.py index 0c83e23a9703e3..eb9709f864d0f6 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1666,6 +1666,20 @@ def callable(a, b) -> number See :func:`torch.reshape` """) +add_docstr_all('reshape_as', + r""" +reshape_as(other) -> Tensor + +Returns this tensor as the same shape as :attr:`other`. +``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``. + +Please see :meth:`~Tensor.reshape` for more information about ``reshape``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same shape + as :attr:`other`. +""") + add_docstr_all('resize_', r""" resize_(*sizes) -> Tensor @@ -1729,8 +1743,8 @@ def callable(a, b) -> number Writes all values from the tensor :attr:`src` into :attr:`self` at the indices specified in the :attr:`index` tensor. For each value in :attr:`src`, its output -index is specified by its index in :attr:`src` for dimension != :attr:`dim` and -by the corresponding value in :attr:`index` for dimension = :attr:`dim`. +index is specified by its index in :attr:`src` for ``dimension != dim`` and by +the corresponding value in :attr:`index` for ``dimension = dim``. For a 3-D tensor, :attr:`self` is updated as:: @@ -1740,14 +1754,14 @@ def callable(a, b) -> number This is the reverse operation of the manner described in :meth:`~Tensor.gather`. -:attr:`self`, :attr:`index` and :attr:`src` should have same number of -dimensions. It is also required that `index.size(d) <= src.size(d)` for all -dimensions `d`, and that `index.size(d) <= self.size(d)` for all dimensions -`d != dim`. +:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should have same +number of dimensions. It is also required that ``index.size(d) <= src.size(d)`` +for all dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all +dimensions ``d != dim``. Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be -between `0` and `(self.size(dim) -1)` inclusive, and all values in a row along -the specified dimension :attr:`dim` must be unique. +between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row +along the specified dimension :attr:`dim` must be unique. Args: dim (int): the axis along which to index @@ -1771,6 +1785,50 @@ def callable(a, b) -> number [ 0.0000, 0.0000, 0.0000, 1.2300]]) """) +add_docstr_all('scatter_add_', + r""" +scatter_add_(dim, index, other) -> Tensor + +Adds all values from the tensor :attr:`other` into :attr:`self` at the indices +specified in the :attr:`index` tensor in a similar fashion as +:meth:`~torch.Tensor.scatter_`. For each value in :attr:`other`, it is added to +an index in :attr:`self` which is specified by its index in :attr:`other` +for ``dimension != dim`` and by the corresponding value in :attr:`index` for +``dimension = dim``. + +For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] += other[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += other[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += other[i][j][k] # if dim == 2 + +:attr:`self`, :attr:`index` and :attr:`other` should have same number of +dimensions. It is also required that ``index.size(d) <= other.size(d)`` for all +dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions +``d != dim``. + +Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be +between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row along +the specified dimension :attr:`dim` must be unique. 
+ +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and add + other (Tensor): the source elements to scatter and add + +Example:: + + >>> x = torch.rand(2, 5) + >>> x + tensor([[0.7404, 0.0427, 0.6480, 0.3806, 0.8328], + [0.7953, 0.2009, 0.9154, 0.6782, 0.9620]]) + >>> torch.ones(3, 5).scatter_add_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x) + tensor([[1.7404, 1.2009, 1.9154, 1.3806, 1.8328], + [1.0000, 1.0427, 1.0000, 1.6782, 1.0000], + [1.7953, 1.0000, 1.6480, 1.0000, 1.9620]]) + +""") + add_docstr_all('select', r""" select(dim, index) -> Tensor @@ -2407,6 +2465,20 @@ def callable(a, b) -> number """) +add_docstr_all('view_as', + r""" +view_as(other) -> Tensor + +View this tensor as the same size as :attr:`other`. +``self.view_as(other)`` is equivalent to ``self.view(other.size())``. + +Please see :meth:`~Tensor.view` for more information about ``view``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. +""") + add_docstr_all('expand', r""" expand(*sizes) -> Tensor @@ -2445,6 +2517,20 @@ def callable(a, b) -> number [ 3, 3, 3, 3]]) """) +add_docstr_all('expand_as', + r""" +expand_as(other) -> Tensor + +Expand this tensor to the same size as :attr:`other`. +``self.expand_as(other)`` is equivalent to ``self.expand(other.size())``. + +Please see :meth:`~Tensor.expand` for more information about ``expand``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. +""") + add_docstr_all('zero_', r""" zero_() -> Tensor diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index f02449ab6e4ff5..ee90abff708a19 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -2,6 +2,7 @@ import torch from functools import reduce from sys import float_info +from torch._six import inf, nan class __PrinterOptions(object): @@ -50,7 +51,7 @@ def set_printoptions( PRINT_OPTS.linewidth = 80 elif profile == "full": PRINT_OPTS.precision = 4 - PRINT_OPTS.threshold = float('inf') + PRINT_OPTS.threshold = inf PRINT_OPTS.edgeitems = 3 PRINT_OPTS.linewidth = 80 @@ -101,8 +102,8 @@ def __init__(self, tensor): else: copy_abs = copy.abs() - pos_inf_mask = copy_abs.eq(float('inf')) - neg_inf_mask = copy_abs.eq(float('-inf')) + pos_inf_mask = copy_abs.eq(inf) + neg_inf_mask = copy_abs.eq(-inf) nan_mask = copy_abs.ne(copy) invalid_value_mask = pos_inf_mask + neg_inf_mask + nan_mask if invalid_value_mask.all(): diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 73c42ef0fa83e4..84a08a155ce97e 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3782,11 +3782,11 @@ def parse_kwargs(desc): end (Number): the ending value for the set of points step (Number): the gap between each pair of adjacent points. Default: ``1``. {out} - {dtype} - If `dtype` is not given, infer the data type from the other input arguments. - If any of `start`, `end`, or `stop` are floating-point, - the `dtype` is inferred to be the default dtype, see :meth:`~torch.get_default_dtype`. - Otherwise, the `dtype` is inferred to be `torch.int64`. + {dtype} If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. 
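A minimal sketch (editorial, not part of this patch) of the dtype-inference rule restated in the reflowed `torch.arange` note above; it assumes the stock default dtype of `torch.float32`:

    import torch

    # Editorial sketch: assumes torch.get_default_dtype() is torch.float32.
    torch.arange(1, 5).dtype      # all-integer arguments -> torch.int64
    torch.arange(0.0, 5.0).dtype  # a floating-point argument -> torch.float32
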
{layout} {device} {requires_grad} @@ -4202,9 +4202,6 @@ def parse_kwargs(desc): `squeeze(input, 0)` leaves the tensor unchanged, but :func:`squeeze(input, 1)` will squeeze the tensor to the shape :math:`(A \times B)`. -.. note:: As an exception to the above, a 1-dimensional tensor of size 1 will - not have its dimensions changed. - .. note:: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. @@ -5108,58 +5105,6 @@ def parse_kwargs(desc): {requires_grad} """.format(**factory_like_common_args)) -add_docstr(torch.stft, - r""" -stft(signal, frame_length, hop, fft_size=None, normalized=False, onesided=True, window=None, pad_end=0) -> Tensor - -Short-time Fourier transform (STFT). - -Ignoring the batch dimension, this method computes the following expression: - -.. math:: - X[m, \omega] = \sum_{k = 0}^{\text{frame_length}}% - window[k]\ signal[m \times hop + k]\ e^{- j \frac{2 \pi \cdot \omega k}{\text{frame_length}}}, - -where :math:`m` is the index of the sliding window, and :math:`\omega` is -the frequency that :math:`0 \leq \omega <` :attr:`fft_size`. When -:attr:`return_onsesided` is the default value ``True``, only values for -:math:`\omega` in range :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{fft_size}}{2} \right\rfloor + 1\right]` -are returned because the real-to-complex transform satisfies the Hermitian -symmetry, i.e., :math:`X[m, \omega] = X[m, \text{fft_size} - \omega]^*`. - -The input :attr:`signal` must be 1-D sequence :math:`(T)` or 2-D a batch of -sequences :math:`(N \times T)`. If :attr:`fft_size` is ``None``, it is -default to same value as :attr:`frame_length`. :attr:`window` can be a -1-D tensor of size :attr:`frame_length`, e.g., see -:meth:`torch.hann_window`. If :attr:`window` is the default value ``None``, -it is treated as if having :math:`1` everywhere in the frame. -:attr:`pad_end` indicates the amount of zero padding at the end of -:attr:`signal` before STFT. If :attr:`normalized` is set to ``True``, the -function returns the normalized STFT results, i.e., multiplied by -:math:`(frame\_length)^{-0.5}`. - -Returns the real and the imaginary parts together as one tensor of size -:math:`(* \times N \times 2)`, where :math:`*` is the shape of input :attr:`signal`, -:math:`N` is the number of :math:`\omega` s considered depending on -:attr:`fft_size` and :attr:`return_onesided`, and each pair in the last -dimension represents a complex number as real part and imaginary part. - -Arguments: - signal (Tensor): the input tensor - frame_length (int): the size of window frame and STFT filter - hop (int): the distance between neighboring sliding window frames - fft_size (int, optional): size of Fourier transform. Default: ``None`` - normalized (bool, optional): controls whether to return the normalized STFT results - Default: ``False`` - onesided (bool, optional): controls whether to return half of results to - avoid redundancy Default: ``True`` - window (Tensor, optional): the optional window function. Default: ``None`` - pad_end (int, optional): implicit zero padding at the end of :attr:`signal`. Default: 0 - -Returns: - Tensor: A tensor containing the STFT result -""") - add_docstr(torch.det, r""" det(A) -> Tensor @@ -5569,7 +5514,7 @@ def parse_kwargs(desc): normalized (bool, optional): controls whether to return normalized results. Default: ``False`` onesided (bool, optional): controls whether to return half of results to - avoid redundancy Default: ``True`` + avoid redundancy. 
Default: ``True`` Returns: Tensor: A tensor containing the real-to-complex Fourier transform result diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp index f130f8164c0ac1..fcd0cf2ff8bb52 100644 --- a/torch/csrc/Device.cpp +++ b/torch/csrc/Device.cpp @@ -4,12 +4,14 @@ #include "torch/csrc/utils/object_ptr.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/python_strings.h" +#include "torch/csrc/utils/python_numbers.h" #include "torch/csrc/utils/pybind.h" #include #include #include +#include #include #include @@ -95,6 +97,13 @@ PyObject *THPDevice_index(THPDevice *self) END_HANDLE_TH_ERRORS } +static Py_ssize_t THPDevice_hash(THPDevice *self) +{ + HANDLE_TH_ERRORS + return static_cast(std::hash{}(self->device) % std::numeric_limits::max()); + END_HANDLE_TH_ERRORS_RET(-1) +} + PyObject *THPDevice_rc(PyObject *a, PyObject *b, int op) { HANDLE_TH_ERRORS if (!THPDevice_Check(a) || !THPDevice_Check(b)) { @@ -181,7 +190,7 @@ PyTypeObject THPDeviceType = { 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ - 0, /* tp_hash */ + (hashfunc)THPDevice_hash, /* tp_hash */ 0, /* tp_call */ (reprfunc)THPDevice_str, /* tp_str */ 0, /* tp_getattro */ diff --git a/torch/csrc/Size.cpp b/torch/csrc/Size.cpp index 0708ebaaa36ccf..95c7c648244b9c 100644 --- a/torch/csrc/Size.cpp +++ b/torch/csrc/Size.cpp @@ -14,7 +14,7 @@ struct THPSize { PyObject * THPSize_New(const torch::autograd::Variable& var) { - if (!torch::jit::tracer::isTracing(var)) { + if (!torch::jit::tracer::isTracing()) { auto sizes = var.sizes(); return THPSize_NewFromSizes(var.dim(), sizes.data()); } @@ -38,10 +38,10 @@ PyObject * THPSize_NewFromSizes(int dim, const int64_t *sizes) return self.release(); } -static bool isTracedVar(PyObject *item) { +static bool isTracedZeroDimVar(PyObject *item) { if (!THPVariable_Check(item)) return false; auto & var = reinterpret_cast(item)->cdata; - return torch::jit::tracer::isTracing(var); + return var.dim() == 0 && torch::jit::tracer::getValueTrace(var); } static PyObject * THPSize_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) @@ -50,10 +50,10 @@ static PyObject * THPSize_pynew(PyTypeObject *type, PyObject *args, PyObject *kw if (self) { for (Py_ssize_t i = 0; i < PyTuple_Size(self); ++i) { PyObject *item = PyTuple_GET_ITEM(self.get(), i); - if (isTracedVar(item)) { + if (THPUtils_checkLong(item)) { continue; } - if (THPUtils_checkLong(item)) { + if (torch::jit::tracer::isTracing() && isTracedZeroDimVar(item)) { continue; } // item.__index__() works with 0-dim tensors and tensors with one element diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index 80c6705cb77a46..3b15ead08d66a1 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -12,7 +12,7 @@ #include // See Note [TH abstraction violation] // - Used to get at the allocator associated with a storage -#include +#include #include #include #include "THP.h" @@ -39,7 +39,7 @@ void THPPointer::free() { } else { AT_ASSERT(ptr->data_ptr.device().is_cuda()); #ifdef USE_CUDA - THCStorage_free(at::globalContext().lazyInitCUDA(), ptr); + THStorage_free(ptr); #else AT_ERROR("Cannot free THCStorage when not built with CUDA"); #endif diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h new file mode 100644 index 00000000000000..b2b03eca03dc63 --- /dev/null +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -0,0 +1,18 @@ +#pragma once + +#ifdef _WIN32 + +#if defined(torch_EXPORTS) +#define TORCH_API __declspec(dllexport) +#else +#define TORCH_API 
__declspec(dllimport) +#endif + +#else +#if defined(torch_EXPORTS) +#define TORCH_API +#else +#define TORCH_API +#endif + +#endif diff --git a/torch/csrc/api/include/torch/nn/cloneable.h b/torch/csrc/api/include/torch/nn/cloneable.h index 61a32e20fe8061..3b304a652d133a 100644 --- a/torch/csrc/api/include/torch/nn/cloneable.h +++ b/torch/csrc/api/include/torch/nn/cloneable.h @@ -43,7 +43,8 @@ class Cloneable : public Module { "Are you sure you called register_parameter() inside reset() " "and not the constructor?"); for (const auto& parameter : parameters_) { - copy->parameters_[parameter.key].data().copy_(parameter->data()); + copy->parameters_[parameter.key].data().copy_( + parameter->data(), /*non_blocking=*/true); } AT_CHECK( copy->buffers_.size() == buffers_.size(), @@ -52,7 +53,8 @@ class Cloneable : public Module { "Are you sure you called register_buffer() inside reset() " "and not the constructor?"); for (const auto& buffer : buffers_) { - copy->buffers_[buffer.key].data().copy_(buffer->data()); + copy->buffers_[buffer.key].data().copy_( + buffer->data(), /*non_blocking=*/true); } AT_CHECK( copy->children_.size() == children_.size(), diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index c4007d89686ff6..c0f56eea72fbd0 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -125,13 +125,13 @@ class CursorBase { template void map(Iterator output_iterator, Function function) { for (auto& item : items_) { - *output_iterator = function(*item); + *output_iterator++ = function(*item); } } template void map(Iterator output_iterator, Function function) const { for (auto& item : items_) { - *output_iterator = function(*item); + *output_iterator++ = function(*item); } } @@ -142,13 +142,13 @@ class CursorBase { template void map_items(Iterator output_iterator, Function function) { for (auto& item : items_) { - *output_iterator = function(item.key, item.value); + *output_iterator++ = function(item.key, item.value); } } template void map_items(Iterator output_iterator, Function function) const { for (auto& item : items_) { - *output_iterator = function(item.key, item.value); + *output_iterator++ = function(item.key, item.value); } } diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index be5fd3a6702826..595baee2532d23 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -186,7 +186,7 @@ class AnyModule::Value { private: friend class AnyModule; - friend class TestValue; + friend struct TestValue; /// Constructs the `Value` from value type. template < diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h index 25c75b85a7df71..d4dd669a286fef 100644 --- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h @@ -19,9 +19,8 @@ struct BatchNormOptions { class BatchNormImpl : public torch::nn::Cloneable { public: - template - explicit BatchNormImpl(Ts&&... 
ts) - : BatchNormImpl(BatchNormOptions(std::forward(ts)...)) {} + explicit BatchNormImpl(int64_t features) + : BatchNormImpl(BatchNormOptions(features)) {} explicit BatchNormImpl(BatchNormOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index f7a7cc0b142460..3f562eab1c5a52 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -32,9 +32,11 @@ struct ConvOptions { template class ConvImpl : public torch::nn::Cloneable { public: - template - explicit ConvImpl(Ts&&... ts) - : ConvImpl(ConvOptions(std::forward(ts)...)) {} + ConvImpl( + int64_t input_channels, + int64_t output_channels, + ExpandingArray kernel_size) + : ConvImpl(ConvOptions(input_channels, output_channels, kernel_size)) {} explicit ConvImpl(ConvOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index 91f4c5b244dd85..23c3e4f127d97f 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -18,9 +18,8 @@ namespace detail { template class DropoutImplBase : public torch::nn::Cloneable { public: - template - explicit DropoutImplBase(Ts&&... ts) - : DropoutImplBase(DropoutOptions(std::forward(ts)...)) {} + explicit DropoutImplBase(double rate) + : DropoutImplBase(DropoutOptions(rate)) {} explicit DropoutImplBase(DropoutOptions options_); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/embedding.h b/torch/csrc/api/include/torch/nn/modules/embedding.h index f35cd05cb7ca2f..3b80d1044a2c18 100644 --- a/torch/csrc/api/include/torch/nn/modules/embedding.h +++ b/torch/csrc/api/include/torch/nn/modules/embedding.h @@ -18,9 +18,8 @@ struct EmbeddingOptions { class EmbeddingImpl : public torch::nn::Cloneable { public: - template - explicit EmbeddingImpl(Ts&&... ts) - : EmbeddingImpl(EmbeddingOptions(std::forward(ts)...)) {} + EmbeddingImpl(int64_t count, int64_t dimension) + : EmbeddingImpl(EmbeddingOptions(count, dimension)) {} explicit EmbeddingImpl(EmbeddingOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/linear.h b/torch/csrc/api/include/torch/nn/modules/linear.h index 34f674991b1e14..5642a9acc58c78 100644 --- a/torch/csrc/api/include/torch/nn/modules/linear.h +++ b/torch/csrc/api/include/torch/nn/modules/linear.h @@ -19,9 +19,7 @@ struct LinearOptions { class LinearImpl : public Cloneable { public: - template - explicit LinearImpl(Ts&&... ts) - : LinearImpl(LinearOptions(std::forward(ts)...)) {} + LinearImpl(int64_t in, int64_t out) : LinearImpl(LinearOptions(in, out)) {} explicit LinearImpl(LinearOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index e6d2ea918e9ec6..3523eff4764fec 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -121,8 +121,8 @@ struct RNNOptions { class RNNImpl : public detail::RNNImplBase { public: - template - explicit RNNImpl(Ts&&... 
ts) : RNNImpl(RNNOptions(std::forward(ts)...)) {} + RNNImpl(int64_t input_size, int64_t hidden_size) + : RNNImpl(RNNOptions(input_size, hidden_size)) {} explicit RNNImpl(RNNOptions options); RNNOptions options; @@ -140,9 +140,8 @@ using LSTMOptions = detail::RNNOptionsBase; class LSTMImpl : public detail::RNNImplBase { public: - template - explicit LSTMImpl(Ts&&... ts) - : LSTMImpl(LSTMOptions(std::forward(ts)...)) {} + LSTMImpl(int64_t input_size, int64_t hidden_size) + : LSTMImpl(LSTMOptions(input_size, hidden_size)) {} explicit LSTMImpl(LSTMOptions options); private: @@ -157,8 +156,8 @@ using GRUOptions = detail::RNNOptionsBase; class GRUImpl : public detail::RNNImplBase { public: - template - explicit GRUImpl(Ts&&... ts) : GRUImpl(GRUOptions(std::forward(ts)...)) {} + GRUImpl(int64_t input_size, int64_t hidden_size) + : GRUImpl(GRUOptions(input_size, hidden_size)) {} explicit GRUImpl(GRUOptions options); private: diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h new file mode 100644 index 00000000000000..82150658dcffe2 --- /dev/null +++ b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h @@ -0,0 +1,176 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace nn { +namespace parallel { + +/// Replicates a module on the given list of devices. +/// A replica is created by calling `clone()` on the module. For this, the +/// module must inherit from `nn::Cloneable`, or define its own `clone()` +/// method, which is expected to perform a deep copy of the module. +template +std::vector> replicate( + const std::shared_ptr& module, + const std::vector& devices) { + std::vector> replicas; + replicas.reserve(devices.size()); + for (const auto& device : devices) { + // Here we rely on the property tensors are never (or should never be) + // allocated on any particular device, but always the default device, e.g. + // in `torch::ones({3, 4})`, the device is unspecified and pulled from the + // current thread local default options. As such, we can here modify these + // thread local default options and thereby cause all tensors in the cloned + // module to be constructed directly on the device we want. + OptionsGuard guard(device); + replicas.push_back(std::static_pointer_cast(module->clone())); + } + return replicas; +} + +/// Replicates a module holder on the given list of devices. +/// This method allows calling `replicate()` with a module holder, such as +/// `Linear`. +template +std::vector> replicate( + const ModuleHolder& module, + const std::vector& devices) { + auto ptrs = replicate(module.ptr(), devices); + return std::vector>(ptrs.begin(), ptrs.end()); +} + +/// Applies the given inputs to the given modules in a parallel fashion. +/// Conceptually, a thread is spawned for each `(module, input)` pair, in which +/// `forward()` is called on the module with its corresponding input. The +/// outputs of the individual calls are stored in a vector and returned. +/// +/// The first exception caught by any thread is stashed and rethrown after all +/// threads have completed their operation. +/// +/// Further remarks: +/// 1. The length of the module container must match the length of the inputs. +/// 2. If a list of devices is supplied, it must match the list of modules in +/// length. 
Each device will be set to the current default device during the +/// invocation of the respective module. This means any tensors allocated on the +/// default device inside the module will be constructed on this device. +template +std::vector parallel_apply( + std::vector& modules, + const std::vector& inputs, + const at::optional>& devices = at::nullopt) { + AT_CHECK( + modules.size() == inputs.size(), "Must have as many inputs as modules"); + if (devices) { + AT_CHECK( + modules.size() == devices->size(), + "Must have as many devices as modules"); + } + + std::vector outputs(modules.size()); + std::mutex mutex; + + // std::exception_ptr can be passed between threads: + // > An instance of std::exception_ptr may be passed to another function, + // > possibly on another thread, where the exception may be rethrown [...]. + // https://en.cppreference.com/w/cpp/error/exception_ptr + std::exception_ptr exception; + + at::parallel_for( + /*begin=*/0, + /*end=*/modules.size(), + /*grain_size=*/1, + [&modules, &inputs, &devices, &outputs, &mutex, &exception]( + int64_t index, int64_t stop) { + for (; index < stop; ++index) { + try { + torch::OptionsGuard options_guard( + devices ? (*devices)[index] : inputs[index].device()); + auto output = modules[index]->forward(inputs[index]); + std::lock_guard lock(mutex); + outputs[index] = output; + } catch (...) { + std::lock_guard lock(mutex); + if (!exception) { + exception = std::current_exception(); + } + } + } + }); + + if (exception) { + std::rethrow_exception(exception); + } + + return outputs; +} + +/// Evaluates `module(input)` in parallel across the given `devices`. If +/// `devices` is not supplied, the invocation is parallelized across all +/// available CUDA devices. If `output_device` is supplied, the final, combined +/// tensor will be placed on this device. If not, it defaults to the first +/// device in `devices`. +/// +/// In detail, this method performs the following four distinct steps: +/// 1. *Scatter* the input to the given devices, +/// 2. *Replicate* (deep clone) the model on each device, +/// 3. *Evaluate* each module with its input on its device, +/// 4. *Gather* the outputs of each replica into a single output tensor, located +/// on the `output_device`. 
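As a rough usage sketch of the data-parallel helpers documented above (an editorial illustration, not part of this patch): `Linear`, the tensor shapes, the `torch::randn` factory call, and the include paths are assumptions chosen only for the example.

    #include <torch/torch.h>
    #include <torch/nn/parallel/data_parallel.h>

    // Editorial sketch: any cloneable module with a Tensor -> Tensor forward()
    // could stand in for Linear here.
    void data_parallel_example() {
      torch::nn::Linear linear(10, 5);      // replicated onto every device
      auto input = torch::randn({64, 10});  // scattered along dim 0
      // Scatter -> replicate -> parallel_apply -> gather; the combined output
      // lands on the first device unless an output_device is supplied.
      auto output = torch::nn::parallel::data_parallel(linear, input);
    }
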
+template +Tensor data_parallel( + ModuleType module, + Tensor input, + at::optional> devices = at::nullopt, + at::optional output_device = at::nullopt, + int64_t dim = 0) { + if (!devices) { + const auto device_count = torch::cuda::device_count(); + AT_CHECK(device_count > 0, "Expected at least one CUDA device"); + devices.emplace(); + devices->reserve(device_count); + for (size_t index = 0; index < device_count; ++index) { + devices->emplace_back(kCUDA, index); + } + } + if (!output_device) { + output_device = devices->front(); + } + + if (devices->size() == 1) { + OptionsGuard guard(devices->front()); + return module->forward(std::move(input)).to(*output_device); + } + + autograd::Scatter scatter(*devices, /*chunk_sizes=*/at::nullopt, dim); + auto scattered_inputs = scatter.apply({std::move(input)}); + + auto replicas = replicate(module, *devices); + auto outputs = parallel_apply(replicas, scattered_inputs, *devices); + return autograd::Gather(*output_device, dim) + .apply(std::move(outputs)) + .front(); +} + +} // namespace parallel +} // namespace nn +} // namespace torch diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index e4b6aa76b5a985..772e19b5e53070 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -38,10 +38,22 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { public: using ContainedType = Contained; + /// Default constructs the contained module if if has a default constructor, + /// else produces a static error. NOTE: This uses the behavior of template + /// classes in C++ that constructors (or any methods) are only compiled when + /// actually used. + ModuleHolder() : impl_(default_construct()) { + static_assert( + std::is_default_constructible::value, + "You are trying to default construct a module which has " + "no default constructor. Use = nullptr to give it the empty state " + "(e.g. `Linear linear = nullptr;` instead of `Linear linear;`)."); + } + /// Constructs the `ModuleHolder` with an empty contained value. Access to /// the underlying module is not permitted and will throw an exception, until /// a value is assigned. - explicit ModuleHolder(std::nullptr_t) : impl_(nullptr) {} + /* implicit */ ModuleHolder(std::nullptr_t) : impl_(nullptr) {} /// Constructs the `ModuleHolder` with a contained module, forwarding all /// arguments to its constructor. @@ -115,6 +127,30 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { bool is_empty() const noexcept { return impl_ == nullptr; } + + private: + /// In C++17, the two methods below could be written as the following: + /// if constexpr (std::is_default_constructible_v) { + /// return std::make_shared(); + /// } else { + /// return nullptr; + /// } + /// In C++11, we use SFINAE instead of `if constexpr`. 
+ + template < + typename T = Contained, + typename = torch::enable_if_t::value>> + std::shared_ptr default_construct() { + return std::make_shared(); + } + + template + torch::disable_if_t< + std::is_default_constructible::value, + std::shared_ptr> + default_construct() { + return nullptr; + } }; } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/optim/optimizer.h b/torch/csrc/api/include/torch/optim/optimizer.h index 6ba46a2bd95127..eed600ab759bc4 100644 --- a/torch/csrc/api/include/torch/optim/optimizer.h +++ b/torch/csrc/api/include/torch/optim/optimizer.h @@ -21,26 +21,33 @@ class OptimizerBase { using ParameterCursor = torch::detail::CursorBase; /// Constructs the `Optimizer` from a vector of parameters. - explicit OptimizerBase(std::vector parameters) - : parameters_(std::move(parameters)) {} + explicit OptimizerBase(std::vector parameters); /// Constructs the `Optimizer` from a ParameterCursor, such as /// `nn::Module::parameters()` returns. - explicit OptimizerBase(ParameterCursor cursor) { - parameters_.reserve(cursor.size()); - for (const auto& parameter : cursor) { - parameters_.push_back(*parameter); - } - } + explicit OptimizerBase(const ParameterCursor& cursor); virtual ~OptimizerBase() = default; + /// Adds the given vector of parameters to the optimizer's parameter list. + /// Override this method if you want to modify the way parameters are added to + /// the `Optimizer`. + virtual void add_parameters(const std::vector& parameters); + + /// Adds the `ParameterCursor`'s parameters to the optimizer's parameter list. + /// NOTE: Calls the `vector` overload of `add_parameters` -- override + /// that method if you want to modify the behavior of `add_parameters`. + virtual void add_parameters(const ParameterCursor& cursor); + /// Zeros out the gradients of all parameters. virtual void zero_grad(); /// Provides a reference to the parameters this optimizer holds. const std::vector& parameters() const noexcept; + /// Returns the number of parameters referenced by the optimizer. + size_t size() const noexcept; + protected: OptimizerBase() = default; diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index e2809204a1aa3c..f21f6c5511b600 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -38,8 +38,7 @@ std::shared_ptr Module::clone() const { AT_ERROR( "clone() has not been implemented for ", name(), - ". Use the copy constructor if you don't require polymorphic cloning. " - "Otherwise, subclass torch::nn::Cloneable<", + ". 
Subclass torch::nn::Cloneable<", name(), "> instead of torch::nn::Module to inherit the ability to clone."); } diff --git a/torch/csrc/api/src/optim/optimizer.cpp b/torch/csrc/api/src/optim/optimizer.cpp index 57c300df07e030..47f2f36423d9d4 100644 --- a/torch/csrc/api/src/optim/optimizer.cpp +++ b/torch/csrc/api/src/optim/optimizer.cpp @@ -1,12 +1,32 @@ #include +#include #include -#include +#include +#include namespace torch { namespace optim { namespace detail { + +OptimizerBase::OptimizerBase(std::vector parameters) + : parameters_(std::move(parameters)) {} + +OptimizerBase::OptimizerBase(const ParameterCursor& cursor) { + add_parameters(cursor); +} + +void OptimizerBase::add_parameters(const std::vector& parameters) { + parameters_.insert(parameters_.end(), parameters.begin(), parameters.end()); +} + +void OptimizerBase::add_parameters(const ParameterCursor& cursor) { + std::vector tensors(cursor.size()); + cursor.map(tensors.begin(), [](const Tensor& tensor) { return tensor; }); + add_parameters(tensors); +} + void OptimizerBase::zero_grad() { for (auto& parameter : parameters_) { auto& grad = parameter.grad(); @@ -16,6 +36,10 @@ void OptimizerBase::zero_grad() { } } } + +size_t OptimizerBase::size() const noexcept { + return parameters_.size(); +} } // namespace detail } // namespace optim } // namespace torch diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index df1e92cec258d5..7327d03f11b887 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -1,5 +1,7 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" + namespace torch { namespace autograd { struct AnomalyMode { @@ -11,7 +13,7 @@ struct AnomalyMode { } private: - static bool _enabled; + TORCH_API static bool _enabled; }; diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index af5e410686c7f0..56ea7f7d290710 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -17,7 +17,13 @@ namespace torch { namespace autograd { -thread_local uint64_t Function::next_sequence_nr_ = 0; +/// Monotonically incrementing (thread local!) counter to supply sequence +/// numbers. +thread_local uint64_t Function_next_sequence_nr_ = 0; + +uint64_t& Function::get_next_sequence_nr() { + return Function_next_sequence_nr_; +} auto Function::name() const -> std::string { return at::demangle(typeid(*this).name()); diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 90189e4a3c4d69..b65a7063f15af5 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -8,7 +8,6 @@ #include "torch/csrc/autograd/saved_variable.h" #include "torch/csrc/autograd/type_and_shape.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/utils/auto_unique_ptr.h" #include "torch/csrc/utils/python_stub.h" #include "torch/csrc/utils/variadic.h" @@ -101,9 +100,8 @@ struct Function : std::enable_shared_from_this { } } - explicit Function( - edge_list&& next_edges = edge_list()) - : Function(next_sequence_nr_++, std::move(next_edges)) {} + explicit Function(edge_list&& next_edges = edge_list()) + : Function(get_next_sequence_nr()++, std::move(next_edges)) {} /// Functions are neither copyable nor moveable. Function(const Function& other) = delete; @@ -307,9 +305,7 @@ struct Function : std::enable_shared_from_this { } protected: - /// Monotonically incrementing (thread local!) counter to supply sequence - /// numbers. 
- static thread_local uint64_t next_sequence_nr_; + static uint64_t& get_next_sequence_nr(); /// Performs the `Function`'s actual operation. virtual variable_list apply(variable_list&& inputs) = 0; diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp new file mode 100644 index 00000000000000..00e140e81b083d --- /dev/null +++ b/torch/csrc/autograd/functions/comm.cpp @@ -0,0 +1,131 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace torch { +namespace autograd { +Scatter::Scatter( + std::vector devices, + const at::optional>& chunk_sizes, + int64_t dim, + const at::optional>& streams, + bool unsqueeze_scalars) + : devices_(std::move(devices)), + chunk_sizes_(chunk_sizes), + dim_(dim), + streams_(streams), + unsqueeze_scalars_(unsqueeze_scalars) {} + +variable_list Scatter::apply(variable_list&& inputs) { +#ifdef USE_CUDA + AT_ASSERT(inputs.size() == 1); + auto& input = inputs.front(); + + std::shared_ptr grad_fn; + if (compute_requires_grad(input)) { + grad_fn = + std::make_shared(/*destination_device=*/input.device(), dim_); + grad_fn->set_next_edges(collect_next_edges(input)); + } + + auto device_indices = fmap(devices_, [](const at::Device& device) -> int64_t { + return device.index(); + }); + auto tensors = torch::cuda::scatter( + std::move(input), device_indices, chunk_sizes_, dim_, streams_); + + std::vector variables; + variables.reserve(tensors.size()); + for (auto& tensor : tensors) { + AT_ASSERT(tensor.defined()); + if (unsqueeze_scalars_) { + AT_ASSERT(tensor.dim() == 1 && tensor.numel() == 1); + variables.push_back(tensor[0]); + } else { + variables.push_back(std::move(tensor)); + } + } + + set_history(variables, grad_fn); + + return variables; +#else + AT_ERROR("Scatter is only supported in CUDA environments"); +#endif +} + +Gather::Gather(const at::Device& destination_device, int64_t dim) + : destination_device_(destination_device), dim_(dim) {} + +variable_list Gather::apply(variable_list&& inputs) { +#ifdef USE_CUDA + bool all_are_zero_dim = true; + for (const auto& input : inputs) { + AT_CHECK( + input.is_cuda(), + "All inputs to Gather must be CUDA tensors, got ", + input.type()); + if (input.dim() > 0) { + all_are_zero_dim = false; + } + } + + const bool unsqueeze_scalars = all_are_zero_dim && dim_ == 0; + if (unsqueeze_scalars) { + AT_WARN( + "Was asked to gather along dimension 0, but all " + "input tensors were scalars; will instead unsqueeze " + "and return a vector."); + } + + std::vector tensors; + tensors.reserve(inputs.size()); + for (auto& variable : inputs) { + if (unsqueeze_scalars) { + tensors.push_back(variable.view(1)); + } else { + tensors.push_back(std::move(variable)); + } + } + + std::shared_ptr grad_fn; + if (compute_requires_grad(inputs)) { + std::vector source_devices; + std::vector input_sizes; + for (auto& input : inputs) { + source_devices.push_back(input.device()); + input_sizes.push_back(input.size(dim_)); + } + grad_fn = std::make_shared( + std::move(source_devices), + std::move(input_sizes), + dim_, + /*streams=*/at::nullopt, + /*unsqueeze_scalars=*/unsqueeze_scalars); + grad_fn->set_next_edges(collect_next_edges(inputs)); + } + + // This is special logic for torch::cuda::gather! + const auto destination_index = + destination_device_.is_cpu() ? 
-1 : destination_device_.index(); + auto variable = torch::cuda::gather(tensors, dim_, destination_index); + set_history(variable, grad_fn); + return {variable}; +#else + AT_ERROR("Gather is only supported in CUDA environments"); +#endif +} + +} // namespace autograd +} // namespace torch diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h new file mode 100644 index 00000000000000..7bbd24a169dcbe --- /dev/null +++ b/torch/csrc/autograd/functions/comm.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +#include + +#include +#include + +namespace torch { +namespace autograd { + +struct Scatter : public Function { + explicit Scatter( + std::vector devices, + const at::optional>& chunk_sizes = at::nullopt, + int64_t dim = 0, + const at::optional>& streams = at::nullopt, + bool unsqueeze_scalars = false); + + variable_list apply(variable_list&& inputs) override; + + std::vector devices_; + at::optional> chunk_sizes_; + int64_t dim_; + at::optional> streams_; + bool unsqueeze_scalars_; +}; + +struct Gather : public Function { + explicit Gather(const at::Device& destination_device, int64_t dim = 0); + + variable_list apply(variable_list&& inputs) override; + + at::Device destination_device_; + int64_t dim_; +}; + +} // namespace autograd +} // namespace torch diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index 5f4d8cdb1c8bc6..a1b7ee74bf8ec6 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -1,7 +1,10 @@ #pragma once -#include "torch/csrc/autograd/function.h" -#include "torch/csrc/autograd/variable.h" +#include +#include +#include + +#include #include #include @@ -18,9 +21,59 @@ using function_constructor = std::function(edge_list&& variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, function_constructor ctr); -/** - * Checks that inputs contains exactly `args` items and that the first `required_args` - * items are not nullptr. If not specified, `required_args` defaults to `args`. - */ +/// Checks that inputs contains exactly `args` items and that the first `required_args` +/// items are not nullptr. If not specified, `required_args` defaults to `args`. void check_input_variables(const char* name, const variable_list& inputs, int args, int required_args=-1); + +struct ComputeRequiresGrad : IterArgs { + bool out = false; + using IterArgs::operator(); + void operator()(const at::Tensor& tensor) { + const auto& var = static_cast(tensor); + if (var.defined() && var.requires_grad()) { + out = true; + } + } + bool short_circuit() { + return out; + } +}; + +template +inline bool compute_requires_grad(Args&&... 
args) { + if (!GradMode::is_enabled()) { + return false; + } + return ComputeRequiresGrad().apply(std::forward<Args>(args)...).out; +} + +inline void set_history( + at::Tensor& variable, + const std::shared_ptr<Function>& grad_fn) { + if (grad_fn) { + if (variable.defined()) { + auto output_nr = + grad_fn->add_input_metadata(variable.type(), variable.sizes()); + as_variable_ref(variable).set_gradient_edge({grad_fn, output_nr}); + } else { + grad_fn->add_input_metadata(Function::undefined_input()); + } + } +} + +inline void set_history( + std::vector<Variable>&& variables, + const std::shared_ptr<Function>& grad_fn) { + for (auto& variable : variables) { + set_history(variable, grad_fn); + } +} + +inline void set_history( + std::vector<Variable>& variables, + const std::shared_ptr<Function>& grad_fn) { + for (auto& variable : variables) { + set_history(variable, grad_fn); + } +} }} diff --git a/torch/csrc/autograd/grad_mode.cpp b/torch/csrc/autograd/grad_mode.cpp index 6409c697a3b808..fc438dfad3d6a6 100644 --- a/torch/csrc/autograd/grad_mode.cpp +++ b/torch/csrc/autograd/grad_mode.cpp @@ -2,6 +2,13 @@ namespace torch { namespace autograd { -thread_local bool GradMode::_enabled = 1; +thread_local bool GradMode_enabled = 1; +bool GradMode::is_enabled() { + return GradMode_enabled; +} + +void GradMode::set_enabled(bool enabled) { + GradMode_enabled = enabled; +} }} diff --git a/torch/csrc/autograd/grad_mode.h b/torch/csrc/autograd/grad_mode.h index 31a514744a5643..e7d15446bee1fd 100644 --- a/torch/csrc/autograd/grad_mode.h +++ b/torch/csrc/autograd/grad_mode.h @@ -3,14 +3,8 @@ namespace torch { namespace autograd { struct GradMode { - static bool is_enabled() { - return _enabled; - } - static void set_enabled(bool enabled) { - _enabled = enabled; - } -private: - static thread_local bool _enabled; + static bool is_enabled(); + static void set_enabled(bool enabled); }; // A RAII, thread local (!) 
guard that enables or disables grad mode upon diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index efc6cad05a4264..ca1575699cf08f 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -24,14 +24,17 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) auto m = py::handle(autograd_module).cast(); - py::class_(m,"ProfilerEvent") - .def("kind",&torch::autograd::profiler::Event::kind) - .def("name",&torch::autograd::profiler::Event::name) - .def("thread_id",&torch::autograd::profiler::Event::thread_id) - .def("device",&torch::autograd::profiler::Event::device) - .def("cpu_elapsed_us",&torch::autograd::profiler::Event::cpu_elapsed_us) - .def("cuda_elapsed_us",&torch::autograd::profiler::Event::cuda_elapsed_us) - .def("has_cuda",&torch::autograd::profiler::Event::has_cuda); + py::class_(m, "ProfilerEvent") + .def("kind", &torch::autograd::profiler::Event::kind) + .def( + "name", + [](const torch::autograd::profiler::Event& e) { return e.name(); }) + .def("thread_id", &torch::autograd::profiler::Event::thread_id) + .def("device", &torch::autograd::profiler::Event::device) + .def("cpu_elapsed_us", &torch::autograd::profiler::Event::cpu_elapsed_us) + .def( + "cuda_elapsed_us", &torch::autograd::profiler::Event::cuda_elapsed_us) + .def("has_cuda", &torch::autograd::profiler::Event::has_cuda); py::enum_(m,"ProfilerState") .value("Disabled", torch::autograd::profiler::ProfilerState::Disabled) .value("CPU", torch::autograd::profiler::ProfilerState::CPU) @@ -41,16 +44,10 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) m.def("_enable_profiler", torch::autograd::profiler::enableProfiler); m.def("_disable_profiler", torch::autograd::profiler::disableProfiler); - m.def("_push_range", [](const char *name) { - using namespace torch::autograd::profiler; - if (state == ProfilerState::Disabled) return; - pushRange(name); - }); - m.def("_pop_range", []() { - using namespace torch::autograd::profiler; - if (state == ProfilerState::Disabled) return; - popRange(); + m.def("_push_range", [](const char* name) { + torch::autograd::profiler::pushRange(name); }); + m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); Py_RETURN_TRUE; } diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index d9f2f37adc16f9..02b36b470c492b 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -10,6 +10,97 @@ std::list> all_event_lists; thread_local std::shared_ptr event_list; thread_local int32_t thread_id; +RangeEventList& getEventList() { + if (!event_list) { + std::lock_guard guard(all_event_lists_mutex); + event_list = std::make_shared(); + thread_id = next_thread_id++; + all_event_lists.emplace_front(event_list); + } + return *event_list; +} + +void mark(std::string name, bool include_cuda /* = true */) { + if (state == ProfilerState::NVTX) { +#ifdef USE_CUDA + nvtxMarkA(name.c_str()); +#else + throw std::logic_error( + "mark called with NVTX tracing, but compiled without CUDA"); +#endif + } else { + getEventList().record( + EventKind::Mark, + std::move(name), + thread_id, + include_cuda && state == ProfilerState::CUDA); + } +} + +void pushRange(std::string name) { + if (state == ProfilerState::Disabled) { + return; + } + if (state == ProfilerState::NVTX) { +#ifdef USE_CUDA + nvtxRangePushA(name.c_str()); +#else + throw std::logic_error( + "pushRange called with NVTX tracing, but compiled without CUDA"); +#endif + } else { + getEventList().record( + EventKind::PushRange, + 
std::move(name), + thread_id, + state == ProfilerState::CUDA); + } +} + +void popRange() { + if (state == ProfilerState::Disabled) { + return; + } + if (state == ProfilerState::NVTX) { +#ifdef USE_CUDA + nvtxRangePop(); +#else + throw std::logic_error( + "popRange called with NVTX tracing, but compiled without CUDA"); +#endif + } else { + getEventList().record( + EventKind::PopRange, + std::string(), + thread_id, + state == ProfilerState::CUDA); + } +} + +RecordFunction::RecordFunction(Function* fn) { + if (state == ProfilerState::Disabled) + return; + pushFunctionRange(fn); +} + +RecordFunction::RecordFunction(std::string name) { + if (state == ProfilerState::Disabled) + return; + pushRange(std::move(name)); +} + +RecordFunction::RecordFunction(const char* name) { + if (state == ProfilerState::Disabled) + return; + pushRange(name); +} + +RecordFunction::~RecordFunction() { + if (state == ProfilerState::Disabled) + return; + popRange(); +} + void RecordFunction::pushFunctionRange(Function* fn) { pushRange(fn->name()); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index c842e00a3b90d7..3df34c728844bc 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -162,80 +162,19 @@ enum class ProfilerState { NVTX, // only emit NVTX markers }; -extern ProfilerState state; -extern uint32_t next_thread_id; -extern std::mutex all_event_lists_mutex; -extern std::list> all_event_lists; - -extern thread_local std::shared_ptr event_list; -extern thread_local int32_t thread_id; - -inline RangeEventList& getEventList() { - if (!event_list) { - std::lock_guard guard(all_event_lists_mutex); - event_list = std::make_shared(); - thread_id = next_thread_id++; - all_event_lists.emplace_front(event_list); - } - return *event_list; -} - -inline void mark(std::string name, bool include_cuda = true) { - if (state == ProfilerState::NVTX) { -#ifdef USE_CUDA - nvtxMarkA(name.c_str()); -#else - throw std::logic_error("mark called with NVTX tracing, but compiled without CUDA"); -#endif - } else { - getEventList().record(EventKind::Mark, std::move(name), thread_id, include_cuda && state == ProfilerState::CUDA); - } -} - -inline void pushRange(std::string name) { - if (state == ProfilerState::NVTX) { -#ifdef USE_CUDA - nvtxRangePushA(name.c_str()); -#else - throw std::logic_error("pushRange called with NVTX tracing, but compiled without CUDA"); -#endif - } else { - getEventList().record(EventKind::PushRange, std::move(name), thread_id, state == ProfilerState::CUDA); - } -} - -inline void popRange() { - if (state == ProfilerState::NVTX) { -#ifdef USE_CUDA - nvtxRangePop(); -#else - throw std::logic_error("popRange called with NVTX tracing, but compiled without CUDA"); -#endif - } else { - getEventList().record(EventKind::PopRange, std::string(), thread_id, state == ProfilerState::CUDA); - } -} +RangeEventList& getEventList(); +void mark(std::string name, bool include_cuda = true); +void pushRange(std::string name); +void popRange(); struct RecordFunction { - explicit RecordFunction(Function *fn) { - if (state == ProfilerState::Disabled) return; - pushFunctionRange(fn); - } + explicit RecordFunction(Function* fn); - explicit RecordFunction(std::string name) { - if (state == ProfilerState::Disabled) return; - pushRange(std::move(name)); - } + explicit RecordFunction(std::string name); - explicit RecordFunction(const char *name) { - if (state == ProfilerState::Disabled) return; - pushRange(name); - } + explicit RecordFunction(const char* name); - ~RecordFunction() { - 
if (state == ProfilerState::Disabled) return; - popRange(); - } + ~RecordFunction(); // Needed only because we don't have Function defined yet. void pushFunctionRange(Function *fn); diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 54943342932447..8c20c6c48e3e16 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -550,7 +550,7 @@ std::pair unpack_input(PyObject *args) { } static void _assert_not_tracing(const char* name, const variable_list& input_vars) { - if (tracer::isTracingVar(input_vars)) { + if (tracer::isTracing()) { std::ostringstream oss; oss << "Attempted to trace " << name; oss << ", but tracing of legacy functions is not supported"; @@ -562,7 +562,7 @@ static jit::tracer::PreTraceInfo _trace_pre_record( PyObject* op_obj, PyObject *input_objects, const variable_list& input_vars) { - if (!tracer::isTracingVar(input_vars)) { + if (!jit::tracer::isTracing()) { return jit::tracer::PreTraceInfo(); } @@ -598,7 +598,7 @@ static void _trace_post_record( const variable_list& input_vars, PyObject *output_objects, bool is_inplace) { - if (!trace_info.state) { + if (!jit::tracer::isTracing()) { return; } @@ -612,7 +612,6 @@ static void _trace_post_record( jit::tracer::postRecordTrace(trace_info, output_vars); - auto state_lock = trace_info.state->lock(); trace_info.n->i_(attr::inplace, is_inplace); } @@ -640,11 +639,6 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked bool is_inplace = static_cast(grad_fn->dirty_tensors); _wrap_outputs(grad_fn, inputs, raw_output, outputs, is_executable); - // NOTE: _trace_post_record has to run before _save_variables, because we need - // to assign traces to outputs before we convert them to SavedVariables. - // On the other hand, it needs to go after _mark_non_differentiable, because - // it might be wraping backwards in Evals, and _mark_non_differentiable uses - // grad_fn pointer equality for error checking. _trace_post_record(trace_info, op_obj, unpacked.input_vars, outputs, is_inplace); if (is_executable) { _save_variables(grad_fn); @@ -715,10 +709,6 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) // Record input nodes if tracing auto trace_info = _trace_pre_record(cls, inputs, unpacked_input.input_vars); - if (trace_info.state) { - // TODO: ezyang suggests this is unused and can be removed - ctx->is_traced = true; - } // Initialize backward function (and ctx) bool is_executable = input_info.is_executable; @@ -1009,7 +999,6 @@ static struct PyGetSetDef THPFunction_properties[] = { {"dirty_tensors", &getObject<&THPFunction::dirty_tensors>, &setObject<&THPFunction::dirty_tensors>, nullptr, nullptr}, {"needs_input_grad", &getObject<&THPFunction::needs_input_grad>, nullptr, nullptr, nullptr}, {"requires_grad", getRequiresGrad, nullptr, nullptr, nullptr}, - {"_is_tracing", &getMember, nullptr, nullptr, nullptr}, {"metadata", (getter)THPFunction_metadata, nullptr, nullptr, nullptr}, {nullptr} }; diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h index 7bc7548e125f43..bdbca1016ebcfa 100644 --- a/torch/csrc/autograd/python_function.h +++ b/torch/csrc/autograd/python_function.h @@ -90,7 +90,6 @@ struct THPFunction { // For each input, true if the input is a THPVariable std::vector is_variable_input; char has_freed_buffers; - char is_traced; // The C++ wrapper for this Python function. // See a comment in THPFunction_asFunction for details about this field. 
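A minimal usage sketch of the reworked profiler API (illustrative only, not part of this patch): the hunks above move `mark`, `pushRange`, `popRange` and the `RecordFunction` constructors/destructor out of the header, and the RAII guard early-returns while profiling is disabled. The wrapper function below is hypothetical; only `RecordFunction` and the header path come from this code base.

#include <ATen/ATen.h>
#include "torch/csrc/autograd/profiler.h"

// Hypothetical operator wrapper: only the RecordFunction guard is taken from
// the patched API above; everything else is illustrative.
at::Tensor add_and_profile(const at::Tensor& a, const at::Tensor& b) {
  // Pushes the range "add_and_profile" now and pops it when `guard` leaves
  // scope; both operations are no-ops unless the profiler has been enabled.
  torch::autograd::profiler::RecordFunction guard("add_and_profile");
  return a + b;
}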
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index a93f4e68fbe31e..607f3b739cdda0 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -16,7 +16,6 @@ #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/utils/python_error_messages.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" -#include "torch/csrc/jit/tracer_state.h" #include "torch/csrc/tensor/python_tensor.h" #include "torch/csrc/utils/auto_gil.h" #include "torch/csrc/utils/cuda_lazy_init.h" diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index 889f456c8b07d8..0f93c13997c8f6 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -3,7 +3,6 @@ #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/tracer_state.h" #include @@ -29,10 +28,6 @@ SavedVariable::SavedVariable(const Variable& variable, bool is_output) { } version_counter_ = variable.version_counter(); saved_version_ = version_counter_.current_version(); - if (variable.has_tracing_state()) { - tracing_state_.reset( - new jit::tracer::ValueTracingState(variable.tracing_state())); - } } } @@ -78,9 +73,6 @@ Variable SavedVariable::unpack(std::shared_ptr saved_for) const { if (requires_grad_ && !var.grad_fn() && grad_accumulator_.expired()) throw std::logic_error("No grad accumulator for a saved leaf!"); var.set_grad_accumulator(grad_accumulator_); - if (tracing_state_) { - var.set_tracing_state(new jit::tracer::ValueTracingState(*tracing_state_)); - } return var; } diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 7372d10c6fa30e..ff5a36ba04c03a 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -1,7 +1,7 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/variable_version.h" -#include "torch/csrc/jit/tracer_state.h" #include @@ -14,7 +14,7 @@ namespace torch { namespace autograd { struct Variable; struct Function; -extern const char* ERR_BACKWARD_TWICE; +TORCH_API extern const char* ERR_BACKWARD_TWICE; /// A snapshot of a variable at a certain version. A `SavedVariable` stores /// enough information to reconstruct a variable from a certain point in time. @@ -43,7 +43,6 @@ class SavedVariable { // passed in to the unpack function when reconstructing the Variable. 
std::shared_ptr grad_fn_; std::weak_ptr grad_accumulator_; - std::unique_ptr tracing_state_; VariableVersion version_counter_; uint32_t saved_version_; diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 16e8105090ecfd..7654c4ee4c4b82 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -9,8 +9,6 @@ #include "torch/csrc/autograd/generated/Functions.h" #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/variable_version.h" -#include "torch/csrc/jit/tracer_state.h" -#include "torch/csrc/utils/auto_unique_ptr.h" #include @@ -141,7 +139,6 @@ void Variable::Impl::release_resources() { grad_.reset(); grad_fn_.reset(); hooks_.clear(); - tracing_state_.reset(); } Variable::ViewImpl::ViewImpl(Variable base, at::Tensor data, Edge gradient_edge) @@ -205,13 +202,4 @@ void Variable::rebase_history(Edge gradient_edge) { } } -void Variable::set_tracing_state( - jit::tracer::ValueTracingState* new_tracing_state) { - get()->tracing_state_.reset(new_tracing_state); -} - -jit::tracer::ValueTracingState& Variable::tracing_state() const noexcept { - return *get()->tracing_state_; -} - }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index a6d670ae55f703..2def489e3ae540 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -2,11 +2,11 @@ #include "torch/csrc/utils/python_stub.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/assertions.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function_hook.h" #include "torch/csrc/autograd/variable_version.h" -#include "torch/csrc/utils/auto_unique_ptr.h" #include #include @@ -19,20 +19,10 @@ #include #include -namespace torch { -namespace autograd { -struct Function; -} // namespace autograd -namespace jit { namespace tracer { -// Has to be forward declared because tracer_state.h has a dependency on -// variable.h. -struct ValueTracingStateElem; -using ValueTracingState = std::list; -}} // namespace jit::tracer -} // namespace torch - namespace torch { namespace autograd { +struct Function; + ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// Variable ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -227,15 +217,6 @@ struct Variable : public at::Tensor { const std::vector>& hooks() const noexcept; void clear_hooks(); - // JIT Tracing - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - void set_tracing_state(jit::tracer::ValueTracingState* new_tracing_state); - jit::tracer::ValueTracingState& tracing_state() const noexcept; - - /// Returns true if the `Variable`'s tracing state is not null. - bool has_tracing_state() const noexcept; - // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -280,7 +261,7 @@ struct Variable : public at::Tensor { //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ struct Variable::Impl : public at::TensorImpl { - explicit Impl( + TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, Edge edge = Edge()); @@ -378,9 +359,6 @@ struct Variable::Impl : public at::TensorImpl { // state are still thread-safe. Used by get_grad_fn and // get_grad_accumulator. 
std::mutex mutex_; - - // For use in torch::jit::tracer - auto_unique_ptr tracing_state_; }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -563,13 +541,6 @@ inline void Variable::clear_hooks() { get()->hooks_.clear(); } -// JIT Tracing -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -inline bool Variable::has_tracing_state() const noexcept { - return get()->tracing_state_ != nullptr; -} - // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index d7c3b76c64b2b3..52a27ea0a44734 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -7,10 +7,6 @@ #include #endif -#include - -#include - #include #include @@ -18,7 +14,6 @@ #include namespace torch { namespace cuda { - using namespace at; // Some operations can be performed more efficiently if we're handling tensors @@ -123,7 +118,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes, int64_t dim, - const at::optional>& streams) { + const at::optional>& streams) { std::vector chunks; if (chunk_sizes) { const int64_t chunk_size_sum = @@ -145,18 +140,20 @@ std::vector scatter( } else { chunks = tensor.chunk(/*chunks=*/devices.size(), /*dim=*/dim); } - auto* thc_state = at::globalContext().lazyInitCUDA(); + at::CUDAGuard cuda_guard; for (size_t chunk = 0; chunk < chunks.size(); ++chunk) { - const int32_t device_index = devices[chunk]; - // We must set the current device before setting the current stream. - const at::DeviceGuard device_guard({at::kCUDA, device_index}); - const AutoStream stream_guard( - streams ? (*streams)[chunk] - : THCState_getStreamOnDevice(thc_state, device_index)); - // Copy the chunk from its current device to its destination device, which - // we set as the default device above, thus specified as -1. 
- chunks[chunk] = - chunks[chunk].contiguous().to({at::kCUDA, -1}, /*non_blocking=*/true); + const auto device_index = static_cast(devices[chunk]); + if (streams) { + AT_CHECK( + (*streams)[chunk].device() == device_index, + "Expected the device associated with the stream at index ", + chunk, " (was ", (*streams)[chunk].device(), ") ", + "to match the device supplied at that index ", + "(expected ", device_index, ")"); + cuda_guard.set_stream((*streams)[chunk]); + } + chunks[chunk] = chunks[chunk].contiguous().to( + {at::kCUDA, device_index}, /*non_blocking=*/true); } return chunks; } @@ -165,7 +162,7 @@ at::Tensor gather( at::TensorList tensors, int64_t dim, at::optional destination_index) { - AT_ASSERT(!tensors.empty()); + AT_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); at::Tensor result; int64_t total_size = 0; auto& first = tensors.front(); @@ -174,7 +171,7 @@ at::Tensor gather( for (const auto& tensor : tensors) { AT_CHECK( tensor.type().is_cuda(), "Gather expects all inputs to have CUDA type"); - AT_CHECK(tensor.ndimension() == static_cast(expected_size.size())); + AT_ASSERT(tensor.ndimension() == static_cast(expected_size.size())); expected_size[dim] = tensor.size(dim); for (size_t dimension = 0; dimension < expected_size.size(); ++dimension) { AT_CHECK( diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index a87cc455691334..c7009a56e9712f 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -1,7 +1,5 @@ #pragma once -#include - #include #include @@ -21,7 +19,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes = at::nullopt, int64_t dim = 0, - const at::optional>& streams = at::nullopt); + const at::optional>& streams = at::nullopt); at::Tensor gather( at::TensorList tensors, diff --git a/torch/csrc/cuda/python_comm.cpp b/torch/csrc/cuda/python_comm.cpp index 902d5b93339ef7..0ec849a7498549 100644 --- a/torch/csrc/cuda/python_comm.cpp +++ b/torch/csrc/cuda/python_comm.cpp @@ -3,6 +3,7 @@ #include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/THCP.h" #include "torch/csrc/utils/auto_gil.h" +#include "torch/csrc/utils/functional.h" #include @@ -27,10 +28,15 @@ void initCommMethods(PyObject *module) { at::optional> chunk_sizes, int64_t dim, at::optional py_streams) { - at::optional> streams; + at::optional> streams; if (py_streams) { py::handle handle = *py_streams; - streams = THPUtils_PySequence_to_THCStreamList(handle.ptr()); + streams = fmap( + THPUtils_PySequence_to_THCStreamList(handle.ptr()), + [](THCStream* stream) { + at::detail::CUDAStream_retain(stream); + return at::CUDAStream(stream); + }); } // Note: We're holding the GIL up to here. 
AutoNoGIL no_gil; diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp index fe17a8c31c952c..5090ad6647b0f3 100644 --- a/torch/csrc/cuda/utils.cpp +++ b/torch/csrc/cuda/utils.cpp @@ -34,10 +34,4 @@ std::vector THPUtils_PySequence_to_THCStreamList(PyObject *obj) { return streams; } -template<> -void THPPointer::free() { - if (ptr) - THCTensor_free(LIBRARY_STATE ptr); -} - #endif diff --git a/torch/csrc/finalizer.h b/torch/csrc/finalizer.h index 4335c50f7198e9..13b9fa5e7bdd04 100644 --- a/torch/csrc/finalizer.h +++ b/torch/csrc/finalizer.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 64772bede8d02c..78087f8d3118c4 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -190,11 +190,15 @@ std::string valueName(Value * n) { return "n" + std::to_string(n->unique()); } - std::string scalarValue(const at::Tensor & t) { +std::string scalarValue(const at::Tensor & t) { auto s = at::Scalar(t); - return (s.isIntegral()) ? - std::to_string(s.toLong()) : - (std::to_string(s.toDouble()) + "f"); + if (s.isIntegral()){ + return std::to_string(s.toLong()); + } else { + std::ostringstream out; + out << std::scientific << s.toDouble() << "f"; + return out.str(); + } } const char * scalarTypeName(at::ScalarType type) { diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 5ef60d95a47dc5..2b0e0b47115506 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -226,7 +226,7 @@ struct GraphExecutorImpl { // there is no need to optimize, but we do need to splice the graph of // this excutor into the trace. Otherwise we might unroll control-flow // operations. - if(isTracing(inputs)) { + if(tracer::isTracing()) { return runTraced(std::move(inputs)); } @@ -274,26 +274,11 @@ struct GraphExecutorImpl { private: friend struct GraphExecutor; - // TODO: switching tracing to be part of the local thread state, instead of - // a per-variable property will make this check significantly faster. - // It is along the fast path, so this is important. 
- static bool isTracing(const variable_tensor_list& inputs) { - for(auto & i : inputs) { - if(i.defined() && tracer::isTracingVar(autograd::as_variable_ref(i))) - return true; - } - return false; - } variable_tensor_list runTraced(variable_tensor_list inputs) { - // TODO: unnecessary copy to variable_list - variable_list input_vars(inputs.begin(), inputs.end()); - auto state = tracer::getTracingState(input_vars); - auto input_values = fmap(input_vars, [&](const Variable& v) { - return tracer::getValueTrace(state, v); - }); + auto state = tracer::getTracingState(); + auto input_values = fmap(inputs, tracer::getValueTrace); ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); - input_vars.clear(); // don't hold inputs during execution auto outputs = runFallback(std::move(inputs)); auto all_dynamic = [](const at::ArrayRef xs) { @@ -316,7 +301,7 @@ struct GraphExecutorImpl { auto output_values = script::inlineCallTo(*state->graph, *local_graph, input_values); for(size_t i = 0; i < outputs.size(); ++i) { - tracer::setValueTrace(state, outputs[i], output_values[i]); + tracer::setValueTrace(outputs[i], output_values[i]); } return outputs; } diff --git a/torch/csrc/jit/interned_strings.cpp b/torch/csrc/jit/interned_strings.cpp index 77ec1848b3679a..d633514256dbb0 100644 --- a/torch/csrc/jit/interned_strings.cpp +++ b/torch/csrc/jit/interned_strings.cpp @@ -1,92 +1,76 @@ -#include +#include "torch/csrc/jit/interned_strings.h" #include -#include -#include +#include #include #include +#include +#include +#include #include "ATen/optional.h" -#include "torch/csrc/assertions.h" -#include "torch/csrc/jit/interned_strings.h" #include "string.h" -#include +#include "torch/csrc/assertions.h" +#include "torch/csrc/jit/interned_strings_class.h" namespace torch { namespace jit { -struct InternedStrings { - InternedStrings() - : sym_to_info_(static_cast(_keys::num_symbols)) { - #define REGISTER_SYMBOL(n, s) \ - string_to_sym_[#n "::" #s] = n::s; \ - sym_to_info_[n::s] = {namespaces::n, #n "::" #s, #s}; +Symbol InternedStrings::symbol(const std::string& s) { + std::lock_guard guard(mutex_); + return _symbol(s); +} - FORALL_NS_SYMBOLS(REGISTER_SYMBOL) - #undef REGISTER_SYMBOL - } - Symbol symbol(const std::string & s) { - std::lock_guard guard(mutex_); - return _symbol(s); - } - std::pair string(Symbol sym) { - // Builtin Symbols are also in the maps, but - // we can bypass the need to acquire a lock - // to read the map for Builtins because we already - // know their string value - switch(sym) { - #define DEFINE_CASE(ns, s) \ - case ns::s: return {#ns "::" #s, #s}; - FORALL_NS_SYMBOLS(DEFINE_CASE) - #undef DEFINE_CASE - default: - return customString(sym); - } +std::pair InternedStrings::string(Symbol sym) { + // Builtin Symbols are also in the maps, but + // we can bypass the need to acquire a lock + // to read the map for Builtins because we already + // know their string value + switch (sym) { +#define DEFINE_CASE(ns, s) \ + case ns::s: \ + return {#ns "::" #s, #s}; + FORALL_NS_SYMBOLS(DEFINE_CASE) +#undef DEFINE_CASE + default: + return customString(sym); } - Symbol ns(Symbol sym) { - switch(sym) { - #define DEFINE_CASE(ns, s) \ - case ns::s: return namespaces::ns; - FORALL_NS_SYMBOLS(DEFINE_CASE) - #undef DEFINE_CASE - default: { - std::lock_guard guard(mutex_); - return sym_to_info_.at(sym).ns; - } +} + +Symbol InternedStrings::ns(Symbol sym) { + switch (sym) { +#define DEFINE_CASE(ns, s) \ + case ns::s: \ + return namespaces::ns; + FORALL_NS_SYMBOLS(DEFINE_CASE) +#undef DEFINE_CASE + default: { + 
std::lock_guard guard(mutex_); + return sym_to_info_.at(sym).ns; } } -private: - // prereq - holding mutex_ - Symbol _symbol(const std::string & s) { - auto it = string_to_sym_.find(s); - if(it != string_to_sym_.end()) - return it->second; - - auto pos = s.find("::"); - if(pos == std::string::npos) { - throw std::runtime_error("all symbols must have a namespace, ::"); - } - Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); +} - Symbol sym(sym_to_info_.size()); - string_to_sym_[s] = sym; - sym_to_info_.push_back({ns, s, s.substr(pos + strlen("::"))}); - return sym; - } +Symbol InternedStrings::_symbol(const std::string& s) { + auto it = string_to_sym_.find(s); + if (it != string_to_sym_.end()) + return it->second; - std::pair customString(Symbol sym) { - std::lock_guard guard(mutex_); - SymbolInfo& s = sym_to_info_.at(sym); - return {s.qual_name.c_str(), s.unqual_name.c_str()}; + auto pos = s.find("::"); + if (pos == std::string::npos) { + throw std::runtime_error( + "all symbols must have a namespace, ::"); } - std::unordered_map string_to_sym_; + Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); - struct SymbolInfo { - Symbol ns; - std::string qual_name; - std::string unqual_name; - }; - std::vector sym_to_info_; + Symbol sym(sym_to_info_.size()); + string_to_sym_[s] = sym; + sym_to_info_.push_back({ns, s, s.substr(pos + strlen("::"))}); + return sym; +} - std::mutex mutex_; -}; +std::pair InternedStrings::customString(Symbol sym) { + std::lock_guard guard(mutex_); + SymbolInfo& s = sym_to_info_.at(sym); + return {s.qual_name.c_str(), s.unqual_name.c_str()}; +} static InternedStrings & globalStrings() { static InternedStrings s; diff --git a/torch/csrc/jit/interned_strings_class.h b/torch/csrc/jit/interned_strings_class.h new file mode 100644 index 00000000000000..c4f7b06ed5f1a7 --- /dev/null +++ b/torch/csrc/jit/interned_strings_class.h @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include +#include +#include +#include "ATen/optional.h" +#include "string.h" +#include "torch/csrc/assertions.h" +#include "torch/csrc/jit/interned_strings.h" + +namespace torch { +namespace jit { + +struct InternedStrings { + InternedStrings(); + Symbol symbol(const std::string& s); + std::pair string(Symbol sym); + Symbol ns(Symbol sym); + + private: + // prereq - holding mutex_ + Symbol _symbol(const std::string& s); + std::pair customString(Symbol sym); + std::unordered_map string_to_sym_; + + struct SymbolInfo { + Symbol ns; + std::string qual_name; + std::string unqual_name; + }; + std::vector sym_to_info_; + + std::mutex mutex_; +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index a42915ba4c17a2..2b555029f16b54 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -7,7 +7,6 @@ #include "torch/csrc/jit/resource_guard.h" #include "torch/csrc/jit/source_location.h" #include "torch/csrc/jit/type.h" -#include "torch/csrc/jit/variable_flags.h" #include "torch/csrc/utils/disallow_copy.h" #include "torch/csrc/utils/functional.h" diff --git a/torch/csrc/jit/passes/onnx.h b/torch/csrc/jit/passes/onnx.h index bd6f6e4444fcc9..a58d421a458d2c 100644 --- a/torch/csrc/jit/passes/onnx.h +++ b/torch/csrc/jit/passes/onnx.h @@ -1,7 +1,6 @@ #pragma once #include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/tracer_state.h" #include "torch/csrc/onnx/onnx.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 534297aa3f174a..f03edfe2d6b3bf 100644 --- 
a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -444,7 +444,7 @@ void initPythonIRBindings(PyObject * module_) { return t.expect()->strides(); }) .def("contiguous",[](Type& t) { - return t.expect()->contiguous(); + return std::static_pointer_cast(t.expect()->contiguous()); }) .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); @@ -471,8 +471,5 @@ void initPythonIRBindings(PyObject * module_) { } return std::make_tuple(graph, variables); }); - m.def("_jit_is_tracing", [](const autograd::Variable& var) { - return tracer::isTracing(var); - }); } }} diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 2ad7a79e9a947f..6397877266683c 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -46,21 +46,26 @@ std::shared_ptr createGraphByTracing( tracer::variable_list trace_inputs, size_t num_func_inputs) { auto enter_info = tracer::enter(std::move(trace_inputs)); - py::tuple py_inputs(num_func_inputs); - for(size_t i = 0; i < num_func_inputs; ++i) { - py_inputs[i] = py::cast(enter_info.second[i]); - } - auto out = func(*py_inputs); - std::vector outputs; - if(PyTuple_Check(out.ptr())) { - outputs = py::cast>(out); - } else { - outputs.push_back(py::cast(out)); + try { + py::tuple py_inputs(num_func_inputs); + for(size_t i = 0; i < num_func_inputs; ++i) { + py_inputs[i] = py::cast(enter_info.second[i]); + } + auto out = func(*py_inputs); + std::vector outputs; + if(PyTuple_Check(out.ptr())) { + outputs = py::cast>(out); + } else { + outputs.push_back(py::cast(out)); + } + tracer::exit(outputs); + auto graph = enter_info.first->graph; + EliminateDeadCode(graph); + return graph; + } catch (...) { + tracer::abandon(); + throw; } - tracer::exit(outputs); - auto graph = enter_info.first->graph; - EliminateDeadCode(graph); - return graph; } PreTraceInfo preRecordPythonTrace(THPObjectPtr pyobj, @@ -119,17 +124,17 @@ void initPythonTracerBindings(PyObject* module_) { m.def("_tracer_exit", [](variable_list var_outputs) { tracer::exit(var_outputs); }); - m.def("_get_tracing_state", [](const variable_list& vars) { - return getTracingState(vars); + m.def("_tracer_abandon", []() { + tracer::abandon(); }); - m.def("_get_value_trace", [](std::shared_ptr& state, const Variable& var) { - return getValueTrace(state, var); + m.def("_get_tracing_state", []() { + return getTracingState(); }); - m.def("_set_value_trace", [](std::shared_ptr& state, const Variable& var, Value* value) { - return setValueTrace(state, var, value); + m.def("_get_value_trace", [](const Variable& var) { + return getValueTrace(var); }); - m.def("_is_tracing", [](const variable_list& vars) { - return isTracingVar(vars); + m.def("_set_value_trace", [](const Variable& var, Value* value) { + return setValueTrace(var, value); }); } diff --git a/torch/csrc/jit/register_symbols.cpp b/torch/csrc/jit/register_symbols.cpp new file mode 100644 index 00000000000000..d08a11dff1a724 --- /dev/null +++ b/torch/csrc/jit/register_symbols.cpp @@ -0,0 +1,18 @@ +#include "torch/csrc/jit/interned_strings_class.h" + +// This file is compiled with -O0 because the fully-macro-expanded +// function is huge and only called once at startup. 
+ +namespace torch { +namespace jit { +InternedStrings::InternedStrings() + : sym_to_info_(static_cast(_keys::num_symbols)) { +#define REGISTER_SYMBOL(n, s) \ + string_to_sym_[#n "::" #s] = n::s; \ + sym_to_info_[n::s] = {namespaces::n, #n "::" #s, #s}; + + FORALL_NS_SYMBOLS(REGISTER_SYMBOL) +#undef REGISTER_SYMBOL +} +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 0fda8352909283..a86059f3004953 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -13,6 +13,28 @@ namespace torch { namespace jit { namespace tracer { +//////////////////////////////////////////////////////////////////////////////// +// Recording the traces +//////////////////////////////////////////////////////////////////////////////// +namespace detail { + +thread_local std::shared_ptr tracing_state; + +} // namespace detail + +const std::shared_ptr& getTracingState() { + return detail::tracing_state; +} + +void setTracingState(std::shared_ptr state) { + detail::tracing_state = std::move(state); +} + +TracingState::TracingState() + : graph(new Graph()) {} + +TracingState::~TracingState() = default; + PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef inputs) { return makePreTraceInfo(inputs, [&op](const std::shared_ptr& state, Graph& graph) { @@ -22,14 +44,10 @@ PreTraceInfo preRecordTrace(Symbol op, void postRecordTrace(const PreTraceInfo& info, at::ArrayRef outputs) { - // TODO: Technically, we could reduce the scope of the lock, but since we - // haven't actually specified what the locking contract is, be conservative. - auto state_lock = info.state->lock(); - auto assignOutput = [&info](const Variable & output, Value * value) { if (output.defined()) { value->inferTypeFrom(output.data()); - setValueTrace(info.state, output, value); + setValueTrace(output, value); } }; @@ -38,35 +56,39 @@ void postRecordTrace(const PreTraceInfo& info, } } -thread_local ArgumentStash ArgumentStash::stash; - -void ArgumentStash::stashIntListElem(const std::string& arg_name, size_t size, size_t idx, const Variable& var) { - // TODO: check type? - if (!isTracing(var)) return; - auto tracing_state = getTracingState({var}); - auto & list_trace = stash.intlists.emplace(arg_name, size).first->second; - JIT_ASSERT(size == list_trace.size()); - JIT_ASSERT(idx < list_trace.size()); - JIT_ASSERT(list_trace[idx] == nullptr); - list_trace[idx] = getValueTrace(tracing_state, var); -} - autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim) { - auto tracing_state = getTracingState({var}); + auto & tracing_state = getTracingState(); auto & graph = tracing_state->graph; auto size_var = autograd::make_variable(at::Scalar(var.size(dim)).toTensor()); - auto* value = getValueTrace(tracing_state, var); + auto* value = getValueTrace(var); auto* node = graph->create(aten::size, {value}) ->i_(attr::dim, dim); node->output()->inferTypeFrom(size_var); graph->appendNode(node); - setValueTrace(tracing_state, size_var, node->output()); + setValueTrace(size_var, node->output()); return size_var; } +//////////////////////////////////////////////////////////////////////////////// +// Argument stash +//////////////////////////////////////////////////////////////////////////////// +thread_local ArgumentStash ArgumentStash::stash; + +void ArgumentStash::stashIntListElem(const std::string& arg_name, size_t size, size_t idx, const Variable& var) { + // TODO: check type? 
+ if (!isTracing()) return; + auto & list_trace = stash.intlists.emplace(arg_name, size).first->second; + JIT_ASSERT(size == list_trace.size()); + JIT_ASSERT(idx < list_trace.size()); + JIT_ASSERT(list_trace[idx] == nullptr); + list_trace[idx] = getValueTrace(var); +} +//////////////////////////////////////////////////////////////////////////////// +// Stack trace recording +//////////////////////////////////////////////////////////////////////////////// // no python present so we just do not record source information void defaultRecordSourceLocation(Node* n) {} std::atomic record_source_location(defaultRecordSourceLocation); diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 5775091f5b8e69..2b8f32e8034f97 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -1,13 +1,12 @@ #pragma once #include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/tracer_state.h" #include "torch/csrc/assertions.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/utils/variadic.h" #include "torch/csrc/autograd/function_hook.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/utils/auto_unique_ptr.h" + #include #include #include @@ -20,38 +19,27 @@ namespace torch { namespace jit { namespace tracer { using torch::autograd::Variable; using variable_list = std::vector; -namespace detail { - -inline ValueTracingStateElem* getValueState(const std::shared_ptr& state, const Variable& var, bool alloc = true) { - auto& tracing_state = var.tracing_state(); - for (auto it = tracing_state.begin(); it != tracing_state.end();) { - auto ts = it->state.lock(); - // GC of invalidated tracing states - if (!ts) { - auto current_it = it++; - tracing_state.erase(current_it); - continue; - } else if (ts == state) { - return &(*it); +struct TracingState : public std::enable_shared_from_this { + TracingState(); + ~TracingState(); + + using WeakTensor = at::WeakTensor; + + struct WeakTensorHasher { + size_t operator()(const WeakTensor& t) const { + return std::hash()(t.unsafeGetTensorImpl()); } - ++it; - } - if (alloc) { - tracing_state.emplace_front(); - auto & vts = tracing_state.front(); - vts.state = state; - return &vts; - } else { - return nullptr; - } -} + }; -inline bool isElemActive(const ValueTracingStateElem& vts) { - auto state = vts.state.lock(); - return state && state->active; -} + struct WeakTensorEq { + bool operator()(const WeakTensor& t1, const WeakTensor& t2) const { + return t1.unsafeGetTensorImpl() == t2.unsafeGetTensorImpl(); + } + }; -} // namespace detail + std::unordered_map value_map; + std::shared_ptr graph; +}; // This is meant to be used as a thread local place, where we can store extra @@ -91,76 +79,27 @@ struct ArgumentStash { std::unordered_map intlists; }; -// Should a function which takes 'vars' as inputs be traced? -// It suffices for ONE variable to be tracing: any "untraced" variables -// are treated as constants. -// -// NB: This code lives in the hotpath; make sure it is fast -// -// NB: Variable overload is not variadic because we don't actually -// need it (in most cases if we have a variable_list it is already -// flattened). 
-inline bool isTracingVar(const Variable& var) { - if (!var.defined() || !var.has_tracing_state()) return false; - return std::any_of(var.tracing_state().begin(), var.tracing_state().end(), detail::isElemActive); -} - -inline bool isTracingVar(at::ArrayRef vars) { - // Reference to avoid refcount bump - for (const Variable& var : vars) { - if (isTracingVar(var)) return true; - } - return false; -} - -struct IsTracing : IterArgs { - bool out = false; - using IterArgs::operator(); - void operator()(const at::Tensor& var) { - out = out || isTracingVar(var); - } - bool short_circuit() { return out; } -}; - -// To be called with Tensor arguments from generated code -template -inline bool isTracing(Args&&... args) { - return IsTracing().apply(std::forward(args)...).out; -} +// Retrieve or set the current tracing state. Returns a nullptr if tracing is disabled. +const std::shared_ptr& getTracingState(); +void setTracingState(std::shared_ptr state); -// Retrieve the tracing state which a function applied with 'vars' should -// be recorded to. Precondition: isTracing(vars) == true. At the moment, -// we don't support mixing up variables from different traces; this code -// will need to be revisited if that ever becomes supported. -inline std::shared_ptr getTracingState(const variable_list& vars) { - std::shared_ptr state; - for (auto& var : vars) { - if (!var.defined() || !var.has_tracing_state()) continue; - for (auto & vts : var.tracing_state()) { - auto var_state = vts.state.lock(); - if (!var_state || !var_state->active) continue; - if (!state) state = var_state; - JIT_ASSERT(var_state == state); - } - } - JIT_ASSERT(state); - return state; +inline bool isTracing() { + return static_cast(getTracingState()); } -// Having finished adding a new 'node' to the graph IR owned by TracingState 'state', -// 'setValueTrace' associates this node with an output variable, so that further operations -// involving this variable know which node in the IR to reference. -inline void setValueTrace(const std::shared_ptr& state, const Variable& var, Value *value) { +// Having finished adding a new 'node' to the graph IR 'setValueTrace' associates +// this node with an output variable, so that further operations involving this +// variable know which node in the IR to reference. +inline void setValueTrace(const Variable& var, Value *value) { JIT_ASSERT(var.defined()); - auto vts = detail::getValueState(state, var); - vts->trace = value; + getTracingState()->value_map[var] = value; } // Given a variable 'var', return the 'node' which represents the instruction -// which computes the value of this variable in the IR. When 'mustExist' is -// false, we interpret untraced variables as constants that are just embedded +// which computes the value of this variable in the IR. +// Here, we interpret untraced variables as constants that are just embedded // in the graph. This is useful to handle code which does things like this -// (from torch.autograd.variable): +// (from torch.autograd.variable, now moved to C++): // // def mm(self, matrix): // output = Variable(self.data.new(self.data.size(0), matrix.data.size(1))) @@ -170,19 +109,21 @@ inline void setValueTrace(const std::shared_ptr& state, const Vari // update on, but subsequently ignores it because the alpha scaling factor is zero. // This is one of the cases where a Variable can be created inside of a trace, and // if we treat it as a constant, everything will work out. 
-inline Value* getValueTrace(const std::shared_ptr& state, const Variable& var) { +inline Value* getValueTrace(const Variable& var) { + auto &state = getTracingState(); if (!var.defined()) { Node *n = state->graph->createUndefined(); return state->graph->appendNode(n)->output(); } - auto vts = detail::getValueState(state, var, true); - if (vts->trace) return vts->trace; - - Value *constant = state->graph->appendNode(state->graph->createConstant(var.data()))->output(); - constant->inferTypeFrom(var.data()); - setValueTrace(state, var, constant); - return constant; + auto & value_map = getTracingState()->value_map; + auto it = value_map.find(var); + if (it == value_map.end()) { + Value *constant = state->graph->appendNode(state->graph->createConstant(var.data()))->output(); + constant->inferTypeFrom(var.data()); + it = value_map.emplace_hint(it, var, constant); + } + return it->second; } inline Value* getOutputTrace(const std::shared_ptr& state, const Variable& var, size_t output_no) { @@ -191,36 +132,37 @@ inline Value* getOutputTrace(const std::shared_ptr& state, const V return state->graph->appendNode(n)->output(); } - auto vts = detail::getValueState(state, var, false); - if (!vts) { + auto & value_map = getTracingState()->value_map; + auto it = value_map.find(var); + if (it == value_map.end()) { std::ostringstream os; os << "output " << output_no << " of traced region did not have observable " << "data dependence with trace inputs; this probably indicates your program " << "cannot be understood by the tracer."; throw std::runtime_error(os.str()); } - return vts->trace; + return it->second; } // Start tracing, treating 'inputs' as inputs to the trace, which can be // varied on subsequent invocations of the trace. Any other variables // will be treated as constants. -// -// NB: Why does this take an rvalue reference? We need to get a non-const -// reference to at::Tensor buffer to call unsafeGetTH, but you can't get this -// out of a const vector (silly std::vector...) inline std::pair, variable_list> enter( variable_list inputs) { + if (isTracing()) { + AT_ERROR("Tracing can't be nested"); + } auto state = std::make_shared(); + setTracingState(state); for (auto& input : inputs) { - auto * value_state = detail::getValueState(state, input, false); + auto * value_state = state->value_map[input]; if (value_state) { // See Note [Repeated inputs] in tracer.cpp input = input.view(input.sizes()); } auto input_node = state->graph->addInput(input.name()); - setValueTrace(state, input, input_node); input_node->inferTypeFrom(input.data()); + state->value_map[input] = input_node; } return std::make_pair(state, inputs); } @@ -229,27 +171,29 @@ inline std::pair, variable_list> enter( // are the variables whose values will be computed upon subsequent // invocations of the trace. inline void exit(const variable_list& outputs) { - auto state = getTracingState(outputs); + auto & state = getTracingState(); size_t i = 0; for (auto& output : outputs) { state->graph->registerOutput(getOutputTrace(state, output, i)); i++; } - state->active = false; + setTracingState(nullptr); +} + +// Abort tracing. Used to reset the state in case of errors. 
+inline void abandon() { + setTracingState(nullptr); } // Pre-recorded information about the trace before we actually carry // out the trace struct PreTraceInfo { - std::shared_ptr state; Node *n; }; PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef inputs); void postRecordTrace(const PreTraceInfo& info, at::ArrayRef outputs); -autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim); - void recordSourceLocation(Node* n); void setRecordSourceLocation(void (*v)(Node*)); @@ -259,15 +203,14 @@ void setRecordSourceLocation(void (*v)(Node*)); template PreTraceInfo makePreTraceInfo(at::ArrayRef inputs, F ctor) { PreTraceInfo info; - info.state = getTracingState(inputs); - auto& graph = info.state->graph; - auto state_lock = info.state->lock(); + auto & state = getTracingState(); + auto & graph = state->graph; - Node *n = ctor(info.state, *graph); + Node *n = ctor(state, *graph); recordSourceLocation(n); - for (Variable input : inputs) { - n->addInput(getValueTrace(info.state, input)); + for (const Variable & input : inputs) { + n->addInput(getValueTrace(input)); } // NB: Order matters. This must append after inputs but before outputs. @@ -278,4 +221,6 @@ PreTraceInfo makePreTraceInfo(at::ArrayRef inputs, F ctor) { return info; } +autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim); + }}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/tracer_state.cpp b/torch/csrc/jit/tracer_state.cpp deleted file mode 100644 index 6f445625fd6b73..00000000000000 --- a/torch/csrc/jit/tracer_state.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "torch/csrc/jit/tracer_state.h" -#include "torch/csrc/jit/ir.h" - -namespace torch { namespace jit { namespace tracer { - -TracingState::TracingState() - : graph(new Graph()) - , active(true) {} - -TracingState::~TracingState() = default; - -}}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/tracer_state.h b/torch/csrc/jit/tracer_state.h deleted file mode 100644 index 887ad94dced892..00000000000000 --- a/torch/csrc/jit/tracer_state.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -#include "torch/csrc/autograd/edge.h" -#include "torch/csrc/autograd/variable.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace torch { namespace jit { -struct Graph; -struct Value; -}} // namespace torch::jit - -namespace torch { namespace jit { namespace tracer { - -// TracingState tracks the necessary state when we are tracing the execution of -// autograd code; most importantly, it holds a reference to the actual IR -// graph which we are recording the trace to. -// -// The liveness of a TracingState is expected to be a superset of the region -// of code being traced; in particular, Variables do not keep a TracingState -// live. Instead, they hold weak pointers to TracingState, to prevent leaks -// from arising when a variable that participated in a trace outlives the -// actual trace itself. 
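The `enter`/`exit`/`abandon` trio above is what the Python tracer drives. A rough sketch of that lifecycle, using the `torch._C._tracer_enter`/`_tracer_exit`/`_tracer_abandon` bindings that appear later in this diff; `trace_once`, `fn`, and `inputs` are placeholder names, and the error handling mirrors `LegacyTracedModule.forward`:

```python
import torch

def trace_once(fn, inputs):
    # inputs: a flat tuple of Tensors to be treated as trace inputs;
    # fn is assumed to return a single Tensor for simplicity.
    trace, trace_inputs = torch._C._tracer_enter(inputs)
    try:
        out = fn(*trace_inputs)
        torch._C._tracer_exit((out,))   # register outputs, clear the global state
    except Exception:
        torch._C._tracer_abandon()      # reset the state so a later trace can start
        raise
    return trace, out
```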
- -struct TracingState : public std::enable_shared_from_this { - TracingState(); - ~TracingState(); - - std::shared_ptr graph; - std::mutex mutex; - bool active; - - std::unique_lock lock() { - return std::unique_lock(mutex); - } -}; - -struct ValueTracingStateElem { - std::weak_ptr state; - // it's only valid to use this field if !state.exired() - Value* trace = nullptr; - - void reset() { - state.reset(); - trace = nullptr; - } -}; - -using ValueTracingState = std::list; - -}}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/variable_flags.cpp b/torch/csrc/jit/variable_flags.cpp deleted file mode 100644 index 8ab565d1a23f59..00000000000000 --- a/torch/csrc/jit/variable_flags.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "torch/csrc/jit/variable_flags.h" - -#include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/tracer_state.h" - -using torch::autograd::Variable; - -namespace torch { namespace jit { - -// These definitions require Variable struct to be defined, so they can't be -// in tracer_state.h -VariableFlags VariableFlags::of(const Variable& var) { - VariableFlags f; - f.defined = var.defined(); - f.requires_grad = f.defined && var.requires_grad(); - return f; -} - -}} diff --git a/torch/csrc/jit/variable_flags.h b/torch/csrc/jit/variable_flags.h deleted file mode 100644 index 43c3ef9bf89a1a..00000000000000 --- a/torch/csrc/jit/variable_flags.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include -namespace torch { namespace autograd { -struct Variable; -}} - -namespace torch { namespace jit { - -struct VariableFlags { - static VariableFlags of(const autograd::Variable& var); - - bool requires_grad; - bool defined; -}; - -static inline std::ostream & operator<<(std::ostream & out, const VariableFlags& v) { - return out - << "(requires_grad=" << v.requires_grad - << ", defined=" << v.defined << ")"; -} - -}} diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp index bcee9df993c27c..56f18033ef088f 100644 --- a/torch/csrc/utils.cpp +++ b/torch/csrc/utils.cpp @@ -230,6 +230,7 @@ bool maybeThrowBackCompatKeepdimWarn(char *func) { template<> void THPPointer::free() { - if (ptr) + if (ptr) { THTensor_free(LIBRARY_STATE ptr); + } } diff --git a/torch/csrc/utils/auto_stream.h b/torch/csrc/utils/auto_stream.h deleted file mode 100644 index 8d7b4d76008727..00000000000000 --- a/torch/csrc/utils/auto_stream.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -// RAII structs to set CUDA stream - -#ifdef USE_CUDA -#include -extern THCState* state; -#endif - -struct AutoStream { -#ifdef USE_CUDA - explicit AutoStream(THCStream* stream) - : original_stream(THCState_getStream(state)) - { - THCStream_retain(original_stream); - THCState_setStream(state, stream); - } - - ~AutoStream() { - THCState_setStream(state, original_stream); - THCStream_free(original_stream); - } - - THCStream* original_stream; -#endif -}; diff --git a/torch/csrc/utils/auto_unique_ptr.h b/torch/csrc/utils/auto_unique_ptr.h deleted file mode 100644 index d49a03608e447c..00000000000000 --- a/torch/csrc/utils/auto_unique_ptr.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include - -namespace torch { - -// A unique_ptr that automatically constructs the object on first dereference. 
-template -struct auto_unique_ptr : public std::unique_ptr { - T& operator*() { - if (!this->get()) this->reset(new T()); - return *this->get(); - } - - T* operator->() { - if (!this->get()) this->reset(new T()); - return this->get(); - } -}; - -} // namespace torch diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index e9cfe030637a05..b1ecc5e93316fc 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -1,4 +1,5 @@ import torch +from torch._six import nan from torch.distributions import constraints from torch.distributions.distribution import Distribution from torch.distributions.utils import probs_to_logits, logits_to_probs, lazy_property, broadcast_all @@ -72,11 +73,11 @@ def param_shape(self): @property def mean(self): - return self.probs.new_tensor(float('nan')).expand(self._extended_shape()) + return self.probs.new_tensor(nan).expand(self._extended_shape()) @property def variance(self): - return self.probs.new_tensor(float('nan')).expand(self._extended_shape()) + return self.probs.new_tensor(nan).expand(self._extended_shape()) def sample(self, sample_shape=torch.Size()): sample_shape = self._extended_shape(sample_shape) diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index 0b4f92d9b7d11b..dec9cfafe134fb 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -1,4 +1,5 @@ import math +from torch._six import inf, nan from numbers import Number import torch @@ -37,11 +38,11 @@ def __init__(self, loc, scale, validate_args=None): @property def mean(self): - return self.loc.new_tensor(float('nan')).expand(self._extended_shape()) + return self.loc.new_tensor(nan).expand(self._extended_shape()) @property def variance(self): - return self.loc.new_tensor(float('inf')).expand(self._extended_shape()) + return self.loc.new_tensor(inf).expand(self._extended_shape()) def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) diff --git a/torch/distributions/fishersnedecor.py b/torch/distributions/fishersnedecor.py index a4338ef903320b..23915598c356b7 100644 --- a/torch/distributions/fishersnedecor.py +++ b/torch/distributions/fishersnedecor.py @@ -1,6 +1,7 @@ from numbers import Number import torch import math +from torch._six import nan from torch.distributions import constraints from torch.distributions.distribution import Distribution from torch.distributions.gamma import Gamma @@ -39,13 +40,13 @@ def __init__(self, df1, df2, validate_args=None): @property def mean(self): df2 = self.df2.clone() - df2[df2 <= 2] = float('nan') + df2[df2 <= 2] = nan return df2 / (df2 - 2) @property def variance(self): df2 = self.df2.clone() - df2[df2 <= 4] = float('nan') + df2[df2 <= 4] = nan return 2 * df2.pow(2) * (self.df1 + df2 - 2) / (self.df1 * (df2 - 2).pow(2) * (df2 - 4)) def rsample(self, sample_shape=torch.Size(())): diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py index 8b979e2137dfbf..77a50d3f03c49f 100644 --- a/torch/distributions/half_cauchy.py +++ b/torch/distributions/half_cauchy.py @@ -1,5 +1,6 @@ import math +from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform from torch.distributions.cauchy import Cauchy @@ -44,7 +45,7 @@ def variance(self): def log_prob(self, value): log_prob = self.base_dist.log_prob(value) + math.log(2) - log_prob[value.expand(log_prob.shape) < 0] = -float('inf') + log_prob[value.expand(log_prob.shape) < 0] = -inf 
return log_prob def cdf(self, value): diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py index 165045b7614092..059f3837604a63 100644 --- a/torch/distributions/half_normal.py +++ b/torch/distributions/half_normal.py @@ -1,5 +1,6 @@ import math +from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform from torch.distributions.normal import Normal @@ -44,7 +45,7 @@ def variance(self): def log_prob(self, value): log_prob = self.base_dist.log_prob(value) + math.log(2) - log_prob[value.expand(log_prob.shape) < 0] = -float('inf') + log_prob[value.expand(log_prob.shape) < 0] = -inf return log_prob def cdf(self, value): diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index 2ae67fc28ccbcd..caedb3e93a1335 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -3,6 +3,7 @@ from functools import total_ordering import torch +from torch._six import inf from .bernoulli import Bernoulli from .beta import Beta @@ -113,7 +114,7 @@ def _infinite_like(tensor): """ Helper function for obtaining infinite KL Divergence throughout """ - return tensor.new_tensor(float('inf')).expand_as(tensor) + return tensor.new_tensor(inf).expand_as(tensor) def _x_log_x(tensor): @@ -173,10 +174,10 @@ def kl_divergence(p, q): @register_kl(Bernoulli, Bernoulli) def _kl_bernoulli_bernoulli(p, q): t1 = p.probs * (p.probs / q.probs).log() - t1[q.probs == 0] = float('inf') + t1[q.probs == 0] = inf t1[p.probs == 0] = 0 t2 = (1 - p.probs) * ((1 - p.probs) / (1 - q.probs)).log() - t2[q.probs == 1] = float('inf') + t2[q.probs == 1] = inf t2[p.probs == 1] = 0 return t1 + t2 @@ -208,7 +209,7 @@ def _kl_binomial_binomial(p, q): @register_kl(Categorical, Categorical) def _kl_categorical_categorical(p, q): t = p.probs * (p.logits - q.logits) - t[q.probs == 0] = float('inf') + t[q.probs == 0] = inf t[p.probs == 0] = 0 return t.sum(-1) @@ -322,7 +323,7 @@ def _kl_pareto_pareto(p, q): t1 = q.alpha * scale_ratio.log() t2 = -alpha_ratio.log() result = t1 + t2 + alpha_ratio - 1 - result[p.support.lower_bound < q.support.lower_bound] = float('inf') + result[p.support.lower_bound < q.support.lower_bound] = inf return result @@ -346,7 +347,7 @@ def _kl_transformed_transformed(p, q): @register_kl(Uniform, Uniform) def _kl_uniform_uniform(p, q): result = ((q.high - q.low) / (p.high - p.low)).log() - result[(q.low > p.low) | (q.high < p.high)] = float('inf') + result[(q.low > p.low) | (q.high < p.high)] = inf return result @@ -392,7 +393,7 @@ def _kl_beta_normal(p, q): @register_kl(Beta, Uniform) def _kl_beta_uniform(p, q): result = -p.entropy() + (q.high - q.low).log() - result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = float('inf') + result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = inf return result @@ -543,7 +544,7 @@ def _kl_pareto_exponential(p, q): t2 = p.alpha.reciprocal() t3 = p.alpha * scale_rate_prod / (p.alpha - 1) result = t1 - t2 + t3 - 1 - result[p.alpha <= 1] = float('inf') + result[p.alpha <= 1] = inf return result @@ -555,7 +556,7 @@ def _kl_pareto_gamma(p, q): t3 = (1 - q.concentration) * common_term t4 = q.rate * p.alpha * p.scale / (p.alpha - 1) result = t1 + t2 + t3 + t4 - 1 - result[p.alpha <= 1] = float('inf') + result[p.alpha <= 1] = inf return result # TODO: Add Pareto-Laplace KL Divergence @@ -570,7 +571,7 @@ def _kl_pareto_normal(p, q): t3 = p.alpha * common_term.pow(2) / (p.alpha - 2) t4 = (p.alpha * common_term - q.loc).pow(2) result 
= t1 - t2 + (t3 + t4) / var_normal - 1 - result[p.alpha <= 2] = float('inf') + result[p.alpha <= 2] = inf return result @@ -588,14 +589,14 @@ def _kl_uniform_beta(p, q): t3 = (q.concentration0 - 1) * (_x_log_x((1 - p.high)) - _x_log_x((1 - p.low)) + common_term) / common_term t4 = q.concentration1.lgamma() + q.concentration0.lgamma() - (q.concentration1 + q.concentration0).lgamma() result = t3 + t4 - t1 - t2 - result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = float('inf') + result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = inf return result @register_kl(Uniform, Exponential) def _kl_uniform_exponetial(p, q): result = q.rate * (p.high + p.low) / 2 - ((p.high - p.low) * q.rate).log() - result[p.low < q.support.lower_bound] = float('inf') + result[p.low < q.support.lower_bound] = inf return result @@ -607,7 +608,7 @@ def _kl_uniform_gamma(p, q): t3 = (1 - q.concentration) * (_x_log_x(p.high) - _x_log_x(p.low) - common_term) / common_term t4 = q.rate * (p.high + p.low) / 2 result = -t1 + t2 + t3 + t4 - result[p.low < q.support.lower_bound] = float('inf') + result[p.low < q.support.lower_bound] = inf return result @@ -638,5 +639,5 @@ def _kl_uniform_pareto(p, q): t1 = (q.alpha * q.scale.pow(q.alpha) * (support_uniform)).log() t2 = (_x_log_x(p.high) - _x_log_x(p.low) - support_uniform) / support_uniform result = t2 * (q.alpha + 1) - t1 - result[p.low < q.support.lower_bound] = float('inf') + result[p.low < q.support.lower_bound] = inf return result diff --git a/torch/distributions/multinomial.py b/torch/distributions/multinomial.py index 57a5853ff10b5c..c045557df8ee6c 100644 --- a/torch/distributions/multinomial.py +++ b/torch/distributions/multinomial.py @@ -1,4 +1,5 @@ import torch +from torch._six import inf from torch.distributions.distribution import Distribution from torch.distributions import Categorical from numbers import Number @@ -93,6 +94,6 @@ def log_prob(self, value): logits, value = broadcast_all(self.logits.clone(), value) log_factorial_n = torch.lgamma(value.sum(-1) + 1) log_factorial_xs = torch.lgamma(value + 1).sum(-1) - logits[(value == 0) & (logits == -float('inf'))] = 0 + logits[(value == 0) & (logits == -inf)] = 0 log_powers = (logits * value).sum(-1) return log_factorial_n - log_factorial_xs + log_powers diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index e1c7cbe533a6ab..e91c7cf88176cf 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -1,5 +1,6 @@ from numbers import Number import torch +from torch._six import inf, nan import math from torch.distributions import constraints from torch.distributions.distribution import Distribution @@ -27,15 +28,15 @@ class StudentT(Distribution): @property def mean(self): m = self.loc.clone() - m[self.df <= 1] = float('nan') + m[self.df <= 1] = nan return m @property def variance(self): m = self.df.clone() m[self.df > 2] = self.scale[self.df > 2].pow(2) * self.df[self.df > 2] / (self.df[self.df > 2] - 2) - m[(self.df <= 2) & (self.df > 1)] = float('inf') - m[self.df <= 1] = float('nan') + m[(self.df <= 2) & (self.df > 1)] = inf + m[self.df <= 1] = nan return m def __init__(self, df, loc=0., scale=1., validate_args=None): diff --git a/torch/functional.py b/torch/functional.py index adc99f40fcd20f..19d47f394fa757 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -1,4 +1,6 @@ import torch +import torch.nn.functional as F +from torch._six import inf from operator import mul from functools import reduce import 
math @@ -8,9 +10,11 @@ 'argmin', 'btrifact', 'btriunpack', + 'isfinite', 'isinf', 'isnan', 'split', + 'stft', 'unique', ] @@ -136,6 +140,25 @@ def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): return P, L, U +def isfinite(tensor): + r"""Returns a new tensor with boolean elements representing if each element is `Finite` or not. + + Arguments: + tensor (Tensor): A tensor to check + + Returns: + Tensor: A ``torch.ByteTensor`` containing a 1 at each location of finite elements and 0 otherwise + + Example:: + + >>> torch.isfinite(torch.Tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([ 1, 0, 1, 0, 0], dtype=torch.uint8) + """ + if not isinstance(tensor, torch.Tensor): + raise ValueError("The argument is not a tensor", str(tensor)) + return (tensor == tensor) & (tensor.abs() != inf) + + def isinf(tensor): r"""Returns a new tensor with boolean elements representing if each element is `+/-INF` or not. @@ -152,7 +175,100 @@ def isinf(tensor): """ if not isinstance(tensor, torch.Tensor): raise ValueError("The argument is not a tensor", str(tensor)) - return tensor.abs() == float('inf') + return tensor.abs() == inf + + +def stft(input, n_fft, hop_length=None, win_length=None, window=None, + center=True, pad_mode='reflect', normalized=False, onesided=True): + r"""Short-time Fourier transform (STFT). + + Ignoring the optional batch dimension, this method computes the following + expression: + + .. math:: + X[m, \omega] = \sum_{k = 0}^{\text{win_length}}% + window[k]\ input[m \times hop_length + k]\ % + e^{- j \frac{2 \pi \cdot \omega k}{\text{win_length}}}, + + where :math:`m` is the index of the sliding window, and :math:`\omega` is + the frequency that :math:`0 \leq \omega < \text{n_fft}`. When + :attr:`onesided` is the default value ``True``, + + * :attr:`input` must be either a 1-D time sequenceor 2-D a batch of time + sequences. + + * If :attr:`hop_length` is ``None`` (default), it is treated as equal to + ``floor(n_fft / 4)``. + + * If :attr:`win_length` is ``None`` (default), it is treated as equal to + :attr:`n_fft`. + + * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from + :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is + treated as if having :math:`1` everywhere in the window. If + :math:`\text{win_length} < \text{n_fft}`, :attr:`window` will be padded on + both sides to length :attr:`n_fft` before being applied. + + * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on + both sides so that the :math:`t`-th frame is centered at time + :math:`t \times \text{hop_length}`. Otherwise, the :math:`t`-th frame + begins at time :math:`t \times \text{hop_length}`. + + * :attr:`pad_mode` determines the padding method used on :attr:`input` when + :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for + all available options. Default is ``"reflect"``. + + * If :attr:`onesided` is ``True`` (default), only values for :math:`\omega` + in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n_fft}}{2} \right\rfloor + 1\right]` + are returned because the real-to-complex Fourier transform satisfies the + conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n_fft} - \omega]^*`. + + * If :attr:`normalized` is ``True`` (default is ``False``), the function + returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame_length})^{-0.5}`. 
+ + Returns the real and the imaginary parts together as one tensor of size + :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional + batch size of :attr:`input`, :math:`N` is the number of frequencies where + STFT is applied, :math:`T` is the total number of frames used, and each pair + in the last dimension represents a complex number as the real part and the + imaginary part. + + .. warning:: + This function changed signature at version 0.4.1. Calling with the + previous signature may cause error or return incorrect result. + + Arguments: + input (Tensor): the input tensor + n_fft (int, optional): size of Fourier transform + hop_length (int): the distance between neighboring sliding window + frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) + win_length (int): the size of window frame and STFT filter. + Default: ``None`` (treated as equal to :attr:`n_fft`) + window (Tensor, optional): the optional window function. + Default: ``None`` (treated as window of all :math:`1`s) + center (bool, optional): whether to pad :attr:`input` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + normalized (bool, optional): controls whether to return the normalized STFT results + Default: ``False`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy Default: ``True`` + + Returns: + Tensor: A tensor containing the STFT result with shape described above + + """ + # TODO: after having proper ways to map Python strings to ATen Enum, move + # this and F.pad to ATen. + if center: + signal_dim = input.dim() + extended_shape = [1] * (3 - signal_dim) + list(input.size()) + pad = int(n_fft // 2) + input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) + input = input.view(input.shape[-signal_dim:]) + return torch._C._VariableFunctions.stft(input, n_fft, hop_length, win_length, window, normalized, onesided) def isnan(tensor): diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index fbf3fabbcfc113..4b605412dcd3a8 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -24,21 +24,10 @@ _jit_script_compile = torch._C._jit_script_compile BatchTensor = torch._C._jit.BatchTensor -# This global variable is set when we are tracing a *forwards* computation. -# It is intended to be a cheap way to test if tracing has occurred, before -# doing the slower path using `get_tracing_state` (below.) -_tracing = False - - -def get_tracing_state(args): - if not torch._C._is_tracing(args): - return None - return torch._C._get_tracing_state(args) - @contextlib.contextmanager -def scope(scope_name, *vars): - tracing_state = get_tracing_state(vars) +def scope(scope_name): + tracing_state = torch._C._get_tracing_state() if tracing_state: tracing_state.push_scope(scope_name) try: @@ -98,18 +87,19 @@ def __init__(self, inner): self.inner = inner def forward(self, *args): - global _tracing in_vars, in_desc = _flatten(args) # NOTE: use full state, because we need it for BatchNorm export # This differs from the compiler path, which doesn't support it at the moment. 
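A short usage example for the `torch.stft` wrapper added in `torch/functional.py` above (input sizes are made up): with `center=True` the signal is reflect-padded by `n_fft // 2` on each side before the ATen kernel runs, so frame `t` is centered at `t * hop_length`.

```python
import torch

signal = torch.randn(2, 16000)                 # (batch, time)
window = torch.hann_window(400)
spec = torch.stft(signal, n_fft=400, hop_length=160, window=window)
# onesided=True by default, so the result has shape
# (batch, n_fft // 2 + 1, n_frames, 2); here (2, 201, 101, 2).
print(spec.shape)
```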
module_state = list(_unique_state_dict(self, keep_vars=True).values()) trace, all_trace_inputs = torch._C._tracer_enter(in_vars + module_state) - _tracing = True - trace_inputs = _unflatten(all_trace_inputs[:len(in_vars)], in_desc) - out = self.inner(*trace_inputs) - out_vars, _ = _flatten(out) - _tracing = False - torch._C._tracer_exit(out_vars) + try: + trace_inputs = _unflatten(all_trace_inputs[:len(in_vars)], in_desc) + out = self.inner(*trace_inputs) + out_vars, _ = _flatten(out) + torch._C._tracer_exit(out_vars) + except Exception: + torch._C._tracer_abandon() + raise return trace, out @@ -289,13 +279,7 @@ def wrapper(func): if len(kwargs) != 0: raise TypeError("got unexpected keyword arguments: {}".format(", ".join(kwargs.keys()))) - if isinstance(func, torch.nn.Module): - orig = func - else: - # traced functions become a method on an Empty module - orig = Module() - - module = TopLevelTracedModule(orig, **executor_options) + module = TopLevelTracedModule(func, **executor_options) module._create_method_from_trace('forward', func, args) return module @@ -683,10 +667,17 @@ class TracedModule(ScriptModule): __frozen = False def __init__(self, orig, id_set=None, optimize=True): + # XXX: orig can be a nn.Module or a function! super(TracedModule, self).__init__(optimize=optimize) if id_set is None: id_set = set() + if not isinstance(orig, torch.nn.Module): + self._name = orig.__name__ + orig = torch.nn.Module() + else: + self._name = 'TracedModule[' + type(orig).__name__ + ']' + def check_unique(param): if param in id_set: raise ValueError("TracedModules don't support parameter sharing between modules") @@ -702,7 +693,6 @@ def check_unique(param): if buf is not None: self._buffers[name] = buf check_unique(buf) - self._orig_class = type(orig) if orig._backward_hooks or orig._forward_hooks or orig._forward_pre_hooks: raise ValueError("Modules that have hooks assigned can't be compiled") @@ -719,7 +709,7 @@ def _freeze(self): self.__frozen = True def _get_name(self): - return 'TracedModule[' + self._orig_class.__name__ + ']' + return self._name def __setattr__(self, attr, value): if not self.__frozen or hasattr(self, attr): diff --git a/torch/legacy/nn/Normalize.py b/torch/legacy/nn/Normalize.py index 1c22f37af84155..1704bdf32b318a 100644 --- a/torch/legacy/nn/Normalize.py +++ b/torch/legacy/nn/Normalize.py @@ -1,4 +1,5 @@ import torch +from torch._six import inf from .Module import Module from .utils import clear @@ -34,7 +35,7 @@ def updateOutput(self, input): self._output.resize_as_(input) # specialization for the infinity norm - if self.p == float('inf'): + if self.p == inf: if not self._indices: self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \ else torch.LongTensor() @@ -72,7 +73,7 @@ def updateGradInput(self, input, gradOutput): self.cross = input.new() # compute diagonal term with gradOutput self._gradInput.resize_(n, d) - if self.p == float('inf'): + if self.p == inf: # specialization for the inf case torch.mul(self.norm.view(n, 1, 1).expand(n, d, 1), gradOutput, out=self._gradInput) self.buffer.resize_as_(input).zero_() @@ -113,7 +114,7 @@ def updateGradInput(self, input, gradOutput): self._gradInput.add_(-1, self.buffer) # reuse cross buffer for normalization - if self.p == float('inf'): + if self.p == inf: torch.mul(self.norm, self.norm, out=self.cross) else: torch.mul(self.normp, self.norm, out=self.cross) diff --git a/torch/legacy/optim/cg.py b/torch/legacy/optim/cg.py index 118de3bd96aac8..7880489edd6f8d 100644 --- 
a/torch/legacy/optim/cg.py +++ b/torch/legacy/optim/cg.py @@ -1,10 +1,11 @@ import math INFINITY = float('inf') +NAN = float('nan') def sqrt_nothrow(x): - return math.sqrt(x) if x >= 0 else float('nan') + return math.sqrt(x) if x >= 0 else NAN def cg(opfunc, x, config, state=None): @@ -145,7 +146,7 @@ def cg(opfunc, x, config, state=None): A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2) _denom = (B + sqrt_nothrow(B * B - A * d2 * z3 * z3)) - z2 = -d2 * z3 * z3 / _denom if _denom != 0 else float('nan') + z2 = -d2 * z3 * z3 / _denom if _denom != 0 else NAN if z2 != z2 or z2 == INFINITY or z2 == -INFINITY or z2 < 0: if limit < -0.5: diff --git a/torch/lib/THD/master_worker/common/RPC-inl.hpp b/torch/lib/THD/master_worker/common/RPC-inl.hpp index b6dfb866d8a616..5885b8350504fc 100644 --- a/torch/lib/THD/master_worker/common/RPC-inl.hpp +++ b/torch/lib/THD/master_worker/common/RPC-inl.hpp @@ -1,5 +1,5 @@ #include -#include "TH/THStorage.h" +#include "TH/THStorageFunctions.h" #include "Traits.hpp" namespace thd { namespace rpc { namespace detail { diff --git a/torch/lib/THD/master_worker/common/RPC.hpp b/torch/lib/THD/master_worker/common/RPC.hpp index af6e8045f99947..99b45942b0a7dc 100644 --- a/torch/lib/THD/master_worker/common/RPC.hpp +++ b/torch/lib/THD/master_worker/common/RPC.hpp @@ -1,7 +1,7 @@ #pragma once #include "../master/THDTensor.h" #include "ByteArray.hpp" -#include "TH/THStorage.h" +#include "TH/THStorageFunctions.h" #include "RPCType.hpp" #include diff --git a/torch/lib/THD/master_worker/master/generic/THDTensor.cpp b/torch/lib/THD/master_worker/master/generic/THDTensor.cpp index e0e174ed6a17e8..93dd5d4b7246ac 100644 --- a/torch/lib/THD/master_worker/master/generic/THDTensor.cpp +++ b/torch/lib/THD/master_worker/master/generic/THDTensor.cpp @@ -826,8 +826,7 @@ ptrdiff_t THDTensor_(nElement)(const THDTensor *self) { } void THDTensor_(retain)(THDTensor *tensor) { - if (tensor->flag & TH_TENSOR_REFCOUNTED) - tensor->refcount++; + tensor->refcount++; } void THDTensor_(free)(THDTensor *tensor) { diff --git a/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp b/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp index 80214f0637bed1..05ec09748ce0ad 100644 --- a/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp +++ b/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp @@ -141,7 +141,6 @@ static THDTensor *THDTensor_(_alloc)() { new_tensor->storageOffset = 0; new_tensor->refcount = 1; - new_tensor->flag = TH_TENSOR_REFCOUNTED; new_tensor->tensor_id = THDState::s_nextId++; return new_tensor; diff --git a/torch/lib/THD/master_worker/worker/Dispatch.cpp b/torch/lib/THD/master_worker/worker/Dispatch.cpp index 1c5f3a793230c0..35e7a38731d5cb 100644 --- a/torch/lib/THD/master_worker/worker/Dispatch.cpp +++ b/torch/lib/THD/master_worker/worker/Dispatch.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/torch/lib/THD/test/rpc_serialization.cpp b/torch/lib/THD/test/rpc_serialization.cpp index 8e9a4783489b3d..cc4f437cb9050b 100644 --- a/torch/lib/THD/test/rpc_serialization.cpp +++ b/torch/lib/THD/test/rpc_serialization.cpp @@ -8,7 +8,7 @@ #include #include "../master_worker/common/RPC.hpp" -#include "TH/THStorage.h" +#include "TH/THStorageFunctions.h" using namespace std; using namespace thd; diff --git a/torch/nn/_functions/dropout.py b/torch/nn/_functions/dropout.py index 6ad2297030b73e..e35ff86bdfa8db 100644 --- a/torch/nn/_functions/dropout.py +++ b/torch/nn/_functions/dropout.py @@ -66,3 +66,65 
@@ def symbolic(g, input, p=0.5, train=False, inplace=False): def _make_noise(input): return input.new().resize_(input.size(0), input.size(1), *repeat(1, input.dim() - 2)) + + +class AlphaDropout(Dropout): + + @staticmethod + def symbolic(g, input, p=0.5, train=False, inplace=False): + # See Note [Export inplace] + # NB: In inference mode, FeatureDropout is exported as an identity op. + from torch.onnx.symbolic import _unimplemented + if train: + return _unimplemented("AlphaDropout", "training mode") + return input + + @classmethod + def forward(cls, ctx, input, p=0.5, train=False, inplace=False): + if p < 0 or p > 1: + raise ValueError("dropout probability has to be between 0 and 1, " + "but got {}".format(p)) + ctx.p = p + ctx.train = train + ctx.inplace = inplace + + if ctx.p == 0 or not ctx.train: + return input + + if ctx.inplace: + ctx.mark_dirty(input) + output = input + else: + output = input.clone() + + ctx.noise = cls._make_noise(input) + if ctx.p == 1: + a = 0 + b = ctx.noise + else: + ctx.noise.bernoulli_(1 - ctx.p) + alpha = 1.7580993408473766 + a = ((alpha ** 2 * ctx.p + 1) * (1 - ctx.p)) ** (-0.5) + b = ctx.noise.add(-1).mul_(alpha * a).add_(alpha * a * ctx.p) + ctx.noise = ctx.noise.mul_(a).expand_as(input) + b = b.expand_as(input) + output.mul_(ctx.noise).add_(b) + + return output + + +class FeatureAlphaDropout(AlphaDropout): + + @staticmethod + def symbolic(g, input, p=0.5, train=False, inplace=False): + # See Note [Export inplace] + # NB: In inference mode, FeatureDropout is exported as an identity op. + from torch.onnx.symbolic import _unimplemented + if train: + return _unimplemented("FeatureAlphaDropout", "training mode") + return input + + @staticmethod + def _make_noise(input): + return input.new().resize_(input.size(0), input.size(1), + *repeat(1, input.dim() - 2)) diff --git a/torch/nn/_functions/rnn.py b/torch/nn/_functions/rnn.py index c7f5d10ccd4df3..1cccb77b78d35f 100644 --- a/torch/nn/_functions/rnn.py +++ b/torch/nn/_functions/rnn.py @@ -310,7 +310,7 @@ def forward(input, *fargs, **fkwargs): # function gets reconstructed each and every time when RNN() is invoked # and we don't want to pay the cost of decorator invocation import torch - if torch._C._jit_is_tracing(input): + if torch._C._get_tracing_state(): import torch.onnx.symbolic sym = torch.onnx.symbolic.RNN_symbolic_builder(*args, **kwargs) cell_type = args[0] @@ -318,7 +318,7 @@ def forward(input, *fargs, **fkwargs): bound_symbolic = partial(torch.onnx.symbolic.rnn_trace_override_symbolic, cell_type, func, sym) - decorator = torch.onnx.symbolic_override_first_arg_based(bound_symbolic) + decorator = torch.onnx.symbolic_override(bound_symbolic) func = decorator(func) return func(input, *fargs, **fkwargs) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 3de3a00cbd02a7..17a7c09b012da6 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -595,35 +595,12 @@ def dropout(input, p=0.5, training=False, inplace=False): return _functions.dropout.Dropout.apply(input, p, training, inplace) -def alpha_dropout(input, p=0.5, training=False): +def alpha_dropout(input, p=0.5, training=False, inplace=False): r"""Applies alpha dropout to the input. See :class:`~torch.nn.AlphaDropout` for details. - - Args: - p (float, optional): the drop probability. Default: 0.5 - training (bool, optional): switch between training and evaluation mode. 
Default: ``False`` """ - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - - if p == 0 or not training: - return input - - alpha = -1.7580993408473766 - keep_prob = 1 - p - # TODO avoid casting to byte after resize - noise = input.data.new().resize_(input.size()) - noise.bernoulli_(p) - noise = noise.byte() - - output = input.masked_fill(noise, alpha) - - a = (keep_prob + alpha ** 2 * keep_prob * (1 - keep_prob)) ** (-0.5) - b = -a * alpha * (1 - keep_prob) - - return output.mul_(a).add_(b) + return _functions.dropout.AlphaDropout.apply(input, p, training, inplace) def dropout2d(input, p=0.5, training=False, inplace=False): @@ -634,6 +611,10 @@ def dropout3d(input, p=0.5, training=False, inplace=False): return _functions.dropout.FeatureDropout.apply(input, p, training, inplace) +def feature_alpha_dropout(input, p=0.5, training=False, inplace=False): + return _functions.dropout.FeatureAlphaDropout.apply(input, p, training, inplace) + + def threshold(input, threshold, value, inplace=False): r"""Thresholds each element of the input Tensor. @@ -1293,7 +1274,7 @@ def instance_norm(input, running_mean=None, running_var=None, weight=None, import torch.onnx.symbolic - @torch.onnx.symbolic_override_first_arg_based(torch.onnx.symbolic.instance_norm) + @torch.onnx.symbolic_override(torch.onnx.symbolic.instance_norm) def _instance_norm(input, running_mean=None, running_var=None, weight=None, bias=None, use_input_stats=None, momentum=None, eps=None): # Repeat stored stats and affine transform params if necessary diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index cea0e41f399e56..4d98f482768a63 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -16,7 +16,7 @@ from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d from .instancenorm import InstanceNorm1d, InstanceNorm2d, InstanceNorm3d from .normalization import LocalResponseNorm, CrossMapLRN2d, LayerNorm, GroupNorm -from .dropout import Dropout, Dropout2d, Dropout3d, AlphaDropout +from .dropout import Dropout, Dropout2d, Dropout3d, AlphaDropout, FeatureAlphaDropout from .padding import ReflectionPad1d, ReflectionPad2d, ReplicationPad1d, ReplicationPad2d, \ ReplicationPad3d, ZeroPad2d, ConstantPad1d, ConstantPad2d, ConstantPad3d from .sparse import Embedding, EmbeddingBag @@ -40,7 +40,8 @@ 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'LocalResponseNorm', 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', - 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', + 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', + 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py index e0900954724932..48415f61929f65 100644 --- a/torch/nn/modules/dropout.py +++ b/torch/nn/modules/dropout.py @@ -131,7 +131,7 @@ def forward(self, input): return F.dropout3d(input, self.p, self.training, self.inplace) -class 
AlphaDropout(Module): +class AlphaDropout(_DropoutNd): r"""Applies Alpha Dropout over the input. Alpha Dropout is a type of Dropout that maintains the self-normalizing @@ -153,6 +153,8 @@ class AlphaDropout(Module): Args: p (float): probability of an element to be dropped. Default: 0.5 + inplace (bool, optional): If set to ``True``, will do this operation + in-place Shape: - Input: `Any`. Input can be of any shape @@ -167,16 +169,11 @@ class AlphaDropout(Module): .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 """ - def __init__(self, p=0.5): - super(AlphaDropout, self).__init__() - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - self.p = p - def forward(self, input): return F.alpha_dropout(input, self.p, self.training) - def __repr__(self): - return self.__class__.__name__ + '(' \ - + 'p=' + str(self.p) + ')' + +class FeatureAlphaDropout(_DropoutNd): + + def forward(self, input): + return F.feature_alpha_dropout(input, self.p, self.training) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 961ce858f2aab2..489e8998843f98 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -523,7 +523,7 @@ class BCEWithLogitsLoss(_Loss): :math:`p_n > 1` increases the recall, :math:`p_n < 1` increases the precision. For example, if a dataset contains 100 positive and 300 negative examples of a single class, - then `pos_weight` for the class should be equal to math:`\frac{300}{100}=3`. + then `pos_weight` for the class should be equal to :math:`\frac{300}{100}=3`. The loss would act as if the dataset contains math:`3\times 100=300` positive examples. Args: diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 91bab5c39e2f10..a00ff3dd9c268c 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -450,16 +450,16 @@ def _tracing_name(self, tracing_state): def _slow_forward(self, *input, **kwargs): input_vars = tuple(torch.autograd.function._iter_tensors(input)) - tracing_state = torch.jit.get_tracing_state(input_vars) + tracing_state = torch._C._get_tracing_state() if not tracing_state: return self.forward(*input, **kwargs) if not hasattr(tracing_state, '_traced_module_stack'): tracing_state._traced_module_stack = [] name = self._tracing_name(tracing_state) if name: - tracing_state.push_scope('%s[%s]' % (self.__class__.__name__, name)) + tracing_state.push_scope('%s[%s]' % (self._get_name(), name)) else: - tracing_state.push_scope(self.__class__.__name__) + tracing_state.push_scope(self._get_name()) tracing_state._traced_module_stack.append(self) try: result = self.forward(*input, **kwargs) @@ -471,7 +471,7 @@ def _slow_forward(self, *input, **kwargs): def __call__(self, *input, **kwargs): for hook in self._forward_pre_hooks.values(): hook(self, input) - if torch.jit._tracing: + if torch._C._get_tracing_state(): result = self._slow_forward(*input, **kwargs) else: result = self.forward(*input, **kwargs) diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index bdb770bd75b94a..48b2a77d2d598d 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -76,19 +76,17 @@ class Scatter(Function): @staticmethod def forward(ctx, target_gpus, chunk_sizes, dim, input): - ctx.target_gpus = target_gpus - ctx.chunk_sizes = chunk_sizes ctx.dim = dim ctx.input_device = input.get_device() if input.is_cuda else -1 streams = None if ctx.input_device == -1: # Perform CPU to GPU copies in a background stream - 
streams = [_get_stream(device) for device in ctx.target_gpus] - outputs = comm.scatter(input, ctx.target_gpus, ctx.chunk_sizes, ctx.dim, streams) + streams = [_get_stream(device) for device in target_gpus] + outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) # Synchronize with the copy stream if streams is not None: for i, output in enumerate(outputs): - with torch.cuda.device(ctx.target_gpus[i]): + with torch.cuda.device(target_gpus[i]): main_stream = torch.cuda.current_stream() main_stream.wait_stream(streams[i]) output.record_stream(main_stream) diff --git a/torch/nn/utils/clip_grad.py b/torch/nn/utils/clip_grad.py index db808adcf70b29..fcccc1f80bdde4 100644 --- a/torch/nn/utils/clip_grad.py +++ b/torch/nn/utils/clip_grad.py @@ -1,5 +1,6 @@ import warnings import torch +from torch._six import inf def clip_grad_norm_(parameters, max_norm, norm_type=2): @@ -23,7 +24,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2): parameters = list(filter(lambda p: p.grad is not None, parameters)) max_norm = float(max_norm) norm_type = float(norm_type) - if norm_type == float('inf'): + if norm_type == inf: total_norm = max(p.grad.data.abs().max() for p in parameters) else: total_norm = 0 diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py index 4c1eee05cba879..d91797f00b8114 100644 --- a/torch/nn/utils/rnn.py +++ b/torch/nn/utils/rnn.py @@ -168,8 +168,7 @@ def pack_padded_sequence_trace_wrapper(input, lengths): return tuple(o for o in outputs) -pack_padded_sequence = torch.onnx.symbolic_override_first_arg_based( - _symbolic_pack_padded_sequence)(pack_padded_sequence) +pack_padded_sequence = torch.onnx.symbolic_override(_symbolic_pack_padded_sequence)(pack_padded_sequence) def pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_length=None): @@ -264,8 +263,7 @@ def pad_packed_sequence_trace_wrapper(data, batch_sizes): return data, lengths -pad_packed_sequence = torch.onnx.symbolic_override_packed_sequence_based( - _symbolic_pad_packed_sequence)(pad_packed_sequence) +pad_packed_sequence = torch.onnx.symbolic_override(_symbolic_pad_packed_sequence)(pad_packed_sequence) def pad_sequence(sequences, batch_first=False, padding_value=0): diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 1807b711a1ccea..0514343da18284 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -56,48 +56,6 @@ def _run_symbolic_method(*args, **kwargs): return utils._run_symbolic_method(*args, **kwargs) -def _symbolic_override_wrapper_maker(symbolic_fn, might_trace, fn): - - def wrapper(*args, **kwargs): - import torch - import torch.jit - from torch.autograd import Function, function - - # fast pass - if not might_trace(args): - return fn(*args, **kwargs) - - flat_args = tuple(function._iter_tensors_permissive(args)) - flat_args_only_tensors = tuple(t for t in flat_args if isinstance(t, torch.Tensor)) - if not any(map(torch._C._jit_is_tracing, flat_args_only_tensors)): - return fn(*args, **kwargs) - - tstate = torch._C._get_tracing_state(flat_args_only_tensors) - - arg_values = [torch._C._get_value_trace(tstate, x) if isinstance(x, torch.Tensor) else x for x in flat_args] - - # This must come after the calls to get_value_trace, lest we - # lose information due to in-place operations. 
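A small usage example for the `clip_grad_norm_` hunk above: the comparison now uses `torch._six.inf`, but callers can keep passing `float('inf')`, since the two compare equal.

```python
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(10, 10)
model(torch.randn(4, 10)).sum().backward()
# Infinity norm: scales gradients so the largest absolute entry is <= 1.0.
clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=float('inf'))
```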
- output_vars = fn(*args, **kwargs) - - symbolic_args = function._unflatten(arg_values, args) - output_vals = symbolic_fn(tstate.graph(), *symbolic_args, **kwargs) - - for var, val in zip( - function._iter_tensors(output_vars), - function._iter_jit_values(output_vals)): - val.inferTypeFrom(var.data) - torch._C._set_value_trace(tstate, var, val) - - return output_vars - - # fn might be autograd.Function too, in this case wrapping doesn't work - if isinstance(fn, types.FunctionType): - wrapper = functools.wraps(fn)(wrapper) - - return wrapper - - def symbolic_override(symbolic_fn): r""" Decorator to override ONNX export of the a function with specified subgraph. @@ -123,47 +81,36 @@ def foo(x, y): return x + y[0] + y[1] ``` """ + def decorator(fn): + import torch + from torch.autograd import function - return functools.partial(_symbolic_override_wrapper_maker, symbolic_fn, lambda x: True) - - -def symbolic_override_first_arg_based(symbolic_fn): - r""" - Decorator to override ONNX export of the a function with specified subgraph. + def wrapper(*args, **kwargs): + tstate = torch._C._get_tracing_state() + if not tstate: + return fn(*args, **kwargs) - Equivalent to :func:`symbolic_override` but checks only the first argument - of the function to figure out whether the tracing is on. Thus the first arg - needs to be a Tensor. - """ + flat_args = tuple(function._iter_tensors_permissive(args)) + arg_values = [torch._C._get_value_trace(x) if isinstance(x, torch.Tensor) else x for x in flat_args] - def might_trace(args): - import torch - first_arg = args[0] - if not isinstance(first_arg, torch.Tensor): - raise ValueError('First argument of {} is expected to be a tensor, ' - 'but got an object of type {}' - .format(symbolic_fn.__name__, type(first_arg))) - return torch._C._jit_is_tracing(first_arg) + # This must come after the calls to get_value_trace, lest we + # lose information due to in-place operations. + output_vars = fn(*args, **kwargs) - return functools.partial(_symbolic_override_wrapper_maker, symbolic_fn, might_trace) + symbolic_args = function._unflatten(arg_values, args) + output_vals = symbolic_fn(tstate.graph(), *symbolic_args, **kwargs) + for var, val in zip( + function._iter_tensors(output_vars), + function._iter_jit_values(output_vals)): + val.inferTypeFrom(var.data) + torch._C._set_value_trace(var, val) -def symbolic_override_packed_sequence_based(symbolic_fn): - r""" - Decorator to override ONNX export of the a function with specified subgraph. + return output_vars - Equivalent to :func:`symbolic_override` but checks only the first argument - of the function to figure out whether the tracing is on. Thus the first arg - needs to be a Tensor. 
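A hypothetical end-to-end use of the reworked `torch.onnx.symbolic_override` above: the decorator now keys off the single global tracing state instead of inspecting its arguments, which is why the first-argument and PackedSequence-based variants are removed in this hunk. `my_relu` and its symbolic are made-up names:

```python
import torch

def my_relu_symbolic(g, x):
    # g is the ONNX graph being traced into; emit a single Relu node.
    return g.op("Relu", x)

@torch.onnx.symbolic_override(my_relu_symbolic)
def my_relu(x):
    return x.clamp(min=0)

# Outside of tracing this is a plain call; during ONNX export the decorated
# body is replaced by the Relu subgraph produced by my_relu_symbolic.
y = my_relu(torch.randn(3))
```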
- """ + # fn might be autograd.Function too, in this case wrapping doesn't work + if isinstance(fn, types.FunctionType): + wrapper = functools.wraps(fn)(wrapper) - def might_trace(args): - import torch - first_arg = args[0] - if not isinstance(first_arg, torch.nn.utils.rnn.PackedSequence): - raise ValueError('pad_packed_sequence expects sequence to be a ' - 'PackedSequence, but got an object of type {}' - .format(type(first_arg))) - return torch._C._jit_is_tracing(first_arg[0]) - - return functools.partial(_symbolic_override_wrapper_maker, symbolic_fn, might_trace) + return wrapper + return decorator diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index f5477501080947..a88739c1cc4906 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -745,6 +745,15 @@ def _cast_func_template(to_i, g, input, non_blocking): globals()[name] = partial(_cast_func_template, v) +def zeros_like(g, input): + return g.op("Sub", input, input).setType(input.type().contiguous()) + + +def full_like(g, input, fill_value): + # TODO: a more efficient implementation (ConstantFill?) + return add(g, zeros_like(g, input), fill_value, alpha=torch.tensor(1)) + + def slice(g, self, dim, start, end, step): if step != 1: _unimplemented("slice", "step!=1 is currently not supported") diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index b40a07580942e7..ad7f780719ccd3 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -1,4 +1,6 @@ import math +import torch +from torch._six import inf from bisect import bisect_right from functools import partial from .optimizer import Optimizer @@ -367,9 +369,9 @@ def _init_is_better(self, mode, threshold, threshold_mode): raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') if mode == 'min': - self.mode_worse = float('inf') + self.mode_worse = inf else: # mode == 'max': - self.mode_worse = (-float('inf')) + self.mode_worse = -inf self.is_better = partial(self._cmp, mode, threshold_mode, threshold) diff --git a/torch/tensor.py b/torch/tensor.py index 7bef2a460db5b4..60a50b6b67b454 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -219,18 +219,6 @@ def share_memory_(self): self.storage().share_memory_() return self - def view_as(self, tensor): - r"""view_as(other) -> Tensor - - View this tensor as the same size as :attr:`other`. - ``self.view_as(other)`` is equivalent to ``self.view(other.size())``. - - Args: - other (:class:`torch.Tensor`): The result tensor has the same size - as :attr:`other.size()`. - """ - return self.view(tensor.size()) - def __reversed__(self): r"""Reverses the tensor along dimension 0.""" if self.dim() == 0: @@ -260,6 +248,17 @@ def btrifact(self, info=None, pivot=True): else: return super(Tensor, self).btrifact(pivot=pivot) + def stft(self, n_fft, hop_length=None, win_length=None, window=None, + center=True, pad_mode='reflect', normalized=False, onesided=True): + r"""See :func:`torch.stft` + + .. warning:: + This function changed signature at version 0.4.1. Calling with + the previous signature may cause error or return incorrect result. 
+ """ + return torch.stft(self, n_fft, hop_length, win_length, window, center, + pad_mode, normalized, onesided) + def resize(self, *sizes): warnings.warn("non-inplace resize is deprecated") from torch.autograd._functions import Resize diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 914edf94b190e2..16ad4130d5a418 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -303,6 +303,7 @@ def CppExtension(name, sources, *args, **kwargs): libraries = kwargs.get('libraries', []) libraries.append('caffe2') + libraries.append('torch') libraries.append('_C') kwargs['libraries'] = libraries @@ -346,6 +347,7 @@ def CUDAExtension(name, sources, *args, **kwargs): libraries.append('cudart') if sys.platform == 'win32': libraries.append('caffe2') + libraries.append('torch') libraries.append('caffe2_gpu') libraries.append('_C') kwargs['libraries'] = libraries @@ -692,6 +694,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose): lib_path = os.path.join(torch_path, 'lib') extra_ldflags.append('caffe2.lib') + extra_ldflags.append('torch.lib') if with_cuda: extra_ldflags.append('caffe2_gpu.lib') extra_ldflags.append('_C.lib')