diff --git a/.clang-tidy b/.clang-tidy index 5466a4a31d20a3..d5fc66c26d42d9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,6 +2,7 @@ # NOTE: there must be no spaces before the '-', so put the comma first. Checks: ' * + ,clang-analyzer-* ,modernize-* ,-cert-err58-cpp ,-cert-err60-cpp @@ -9,6 +10,7 @@ Checks: ' ,-cppcoreguidelines-owning-memory ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay ,-cppcoreguidelines-pro-bounds-constant-array-index + ,-cppcoreguidelines-pro-type-member-init ,-cppcoreguidelines-pro-type-static-cast-downcast ,-cppcoreguidelines-pro-type-vararg ,-cppcoreguidelines-special-member-functions @@ -23,9 +25,11 @@ Checks: ' ,-hicpp-braces-around-statements ,-hicpp-explicit-conversions ,-hicpp-no-array-decay + ,-hicpp-signed-bitwise ,-hicpp-special-member-functions ,-hicpp-vararg ,-llvm-header-guard + ,-llvm-include-order ,-llvm-namespace-comment ,-misc-unused-parameters ,-modernize-make-unique @@ -34,7 +38,6 @@ Checks: ' ,-readability-braces-around-statements ,-readability-else-after-return ,-readability-named-parameter - ,clang-analyzer-* ' WarningsAsErrors: '' HeaderFilterRegex: 'torch/csrc/' diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000000..cd41d1a02f8290 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.bat text eol=crlf diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 6b8aa6fc62bb94..3bc5157d9cab7a 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -124,7 +124,7 @@ CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") CMAKE_ARGS+=("-DUSE_ZSTD=ON") CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") -if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then +if [[ $BUILD_ENVIRONMENT == *-aten-* || -n "$INTEGRATED" ]]; then if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then CMAKE_ARGS+=("-DBUILD_ATEN=ON") fi diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 053a9be5e05487..40e3e21417b9b2 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -64,7 +64,13 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do ;; */aten/*) # ATen uses test framework Catch2 - "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" + # NB: We do NOT use the xml test reporter, because + # Catch doesn't support multiple reporters + # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 + # which means that enabling XML output means you lose useful stdout + # output for Jenkins. It's more important to have useful console + # output than it is to have XML output for Jenkins. 
+ "$test" ;; *) "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" @@ -109,6 +115,10 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") + + # Our AMD CI boxes have 4 gpus on each + # Remove this once we have added multi-gpu support + export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 56db6914c1c20a..48e81dfd635bce 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -43,12 +43,9 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking export KMTHINLTO=1 - sudo chown -R jenkins:jenkins /usr/local - rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true - python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" - - USE_ROCM=1 python setup.py install - exit + python tools/amd_build/build_pytorch_amd.py + USE_ROCM=1 python setup.py install --user + exit 0 fi # TODO: Don't install this here diff --git a/CMakeLists.txt b/CMakeLists.txt index 651e230ab35ea7..c7eb20d1336550 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,9 +214,10 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow") # These flags are not available in GCC-4.8.5. Set only when using clang. # Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-typedef-redefinition") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") @@ -226,6 +227,7 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++14-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constexpr-not-const") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") endif() if ((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0"))) OR (CMAKE_COMPILER_IS_GNUCXX @@ -284,6 +286,8 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # in PROJECT_SOURCE_DIR. 
include_directories(BEFORE ${PROJECT_BINARY_DIR}) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) + # ---[ Old caffe protobuf if(BUILD_CAFFE2) add_subdirectory(caffe/proto) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 462a12b086d2d0..2f2ffdce186d39 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -146,4 +146,5 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) + set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) endif() diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index c1c78102a0fef8..26989a7ea7fbed 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace at { diff --git a/aten/src/ATen/ArrayRef.cpp b/aten/src/ATen/ArrayRef.cpp new file mode 100644 index 00000000000000..2a5d1f7a7cb595 --- /dev/null +++ b/aten/src/ATen/ArrayRef.cpp @@ -0,0 +1 @@ +#include diff --git a/aten/src/ATen/ArrayRef.h b/aten/src/ATen/ArrayRef.h index df144025578c6b..f52a5fcf1c2c58 100644 --- a/aten/src/ATen/ArrayRef.h +++ b/aten/src/ATen/ArrayRef.h @@ -1,192 +1,2 @@ -//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::ArrayRef. -// removed llvm-specific functionality -// removed some implicit const -> non-const conversions that rely on -// complicated std::enable_if meta-programming -// removed a bunch of slice variants for simplicity... - #pragma once - -#include -#include - -#include -#include -#include - -namespace at { - /// ArrayRef - Represent a constant reference to an array (0 or more elements - /// consecutively in memory), i.e. a start pointer and a length. It allows - /// various APIs to take consecutive elements easily and conveniently. - /// - /// This class does not own the underlying data, it is expected to be used in - /// situations where the data resides in some other buffer, whose lifetime - /// extends past that of the ArrayRef. For this reason, it is not in general - /// safe to store an ArrayRef. - /// - /// This is intended to be trivially copyable, so it should be passed by - /// value. - template - class ArrayRef { - public: - typedef const T *iterator; - typedef const T *const_iterator; - typedef size_t size_type; - - typedef std::reverse_iterator reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T *Data; - - /// The number of elements. - size_type Length; - - public: - /// @name Constructors - /// @{ - - /// Construct an empty ArrayRef. - /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - /*implicit*/ ArrayRef(const T &OneElt) - : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - /*implicit*/ ArrayRef(const T *data, size_t length) - : Data(data), Length(length) {} - - /// Construct an ArrayRef from a range. - ArrayRef(const T *begin, const T *end) - : Data(begin), Length(end - begin) {} - - /// Construct an ArrayRef from a SmallVector. 
This is templated in order to - /// avoid instantiating SmallVectorTemplateCommon whenever we - /// copy-construct an ArrayRef. - template - /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) - : Data(Vec.data()), Length(Vec.size()) { - } - - /// Construct an ArrayRef from a std::vector. - template - /*implicit*/ ArrayRef(const std::vector &Vec) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from a std::array - template - /*implicit*/ constexpr ArrayRef(const std::array &Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /*implicit*/ ArrayRef(const std::initializer_list &Vec) - : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()), - Length(Vec.size()) {} - - /// @} - /// @name Simple Operations - /// @{ - - const_iterator begin() const { return Data; } - const_iterator end() const { return Data + Length; } - - reverse_iterator rbegin() const { return reverse_iterator(end()); } - reverse_iterator rend() const { return reverse_iterator(begin()); } - - /// empty - Check if the array is empty. - bool empty() const { return Length == 0; } - - const T *data() const { return Data; } - - /// size - Get the array size. - size_t size() const { return Length; } - - /// front - Get the first element. - const T &front() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; - } - - /// back - Get the last element. - const T &back() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length-1]; - } - - /// equals - Check for element-wise equality. - bool equals(ArrayRef RHS) const { - if (Length != RHS.Length) - return false; - return std::equal(begin(), end(), RHS.begin()); - } - - /// slice(n, m) - Chop off the first N elements of the array, and keep M - /// elements in the array. - ArrayRef slice(size_t N, size_t M) const { - AT_CHECK(N+M <= size(), "ArrayRef: invalid slice, ", N, " + ", M, " is not <= ", size()); - return ArrayRef(data()+N, M); - } - - /// slice(n) - Chop off the first N elements of the array. - ArrayRef slice(size_t N) const { return slice(N, size() - N); } - - /// @} - /// @name Operator Overloads - /// @{ - const T &operator[](size_t Index) const { - return Data[Index]; - } - - /// Vector compatibility - const T &at(size_t Index) const { - AT_CHECK(Index < Length, "ArrayRef: invalid index ", Index, " for length ", Length); - return Data[Index]; - } - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - typename std::enable_if::value, ArrayRef>::type & - operator=(U &&Temporary) = delete; - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. 
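(Editorial aside, not part of the patch.) The deleted header above documents that ArrayRef is a non-owning view which must not outlive its backing buffer, and that slicing and at() are bounds-checked with AT_CHECK. A minimal C++ sketch of that contract, assuming <ATen/ArrayRef.h> keeps working as a forwarding header after this move (the include target of the new one-line file is elided in this diff):

#include <ATen/ArrayRef.h>
#include <cassert>
#include <vector>

void arrayref_demo() {
  std::vector<int> storage = {1, 2, 3, 4, 5};
  at::ArrayRef<int> view(storage);        // non-owning view over the vector
  at::ArrayRef<int> tail = view.slice(2); // {3, 4, 5}; bounds-checked via AT_CHECK
  assert(tail.size() == 3 && tail.front() == 3);
  // Unsafe: a view over a temporary vector dangles as soon as the temporary
  // dies, which is why storing an ArrayRef is discouraged:
  //   at::ArrayRef<int> dangling(std::vector<int>{7, 8, 9});
  std::vector<int> copy = view.vec();     // materialize an owning copy instead
  (void)copy;
}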
- template - typename std::enable_if::value, ArrayRef>::type & - operator=(std::initializer_list) = delete; - - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data+Length); - } - - /// @} - /// @name Conversion operators - /// @{ - operator std::vector() const { - return std::vector(Data, Data+Length); - } - - /// @} - }; - -} // end namespace at +#include diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h index 347c430d61b75c..bdef9f4a9de439 100644 --- a/aten/src/ATen/Backtrace.h +++ b/aten/src/ATen/Backtrace.h @@ -1,28 +1,2 @@ #pragma once - -#include -#include -#include - -#include - -namespace at { -/// Utility to demangle a C++ symbol name. -AT_API std::string demangle(const char* name); - -/// Returns the printable name of the type. -template -inline const char* demangle_type() { -#ifdef __GXX_RTTI - static const std::string name = demangle(typeid(T).name()); - return name.c_str(); -#else // __GXX_RTTI - return "(RTTI disabled, cannot show name)"; -#endif // __GXX_RTTI -} - -AT_API std::string get_backtrace( - size_t frames_to_skip = 0, - size_t maximum_number_of_frames = 64, - bool skip_python_frames = true); -} // namespace at +#include diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 562910ad86a298..25a2e6d8b501f0 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -44,6 +44,7 @@ CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig # NB: If you edit these globs, you'll have to update setup.py package_data as well FILE(GLOB base_h "*.h" "detail/*.h") FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") +add_subdirectory(core) FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") @@ -62,7 +63,7 @@ FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") -set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -393,7 +394,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake -FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) +FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) @@ -444,6 +445,7 @@ if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() # Pass source, includes, and libs to parent +set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 2db2786b1c66cd..ef370ea6e0bc30 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -109,8 +109,8 @@ 
struct strided_tensor_iter { : data_(tensor.data()), dim_(tensor.ndimension()), counter_(dim_, 0), - sizes_(tensor.sizes()), - strides_(tensor.strides()) { + sizes_(tensor.sizes().vec()), + strides_(tensor.strides().vec()) { _setup_arrays(tensor, this); } }; diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 59f6ff755ee3f1..d153e6bc6ada00 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -37,8 +37,11 @@ Context::Context() Type::registerCPU(this); } +// NB: Ensure that globalContext is initialized before we load +// variable hooks, otherwise we will deadlock. Regardless, the +// deadlock is bad, and being tracked at https://github.com/pytorch/pytorch/issues/9784 +static Context globalContext_; Context & globalContext() { - static Context globalContext_; return globalContext_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 309c4be2e651dd..7d3fdd1cc2d4af 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,6 +9,9 @@ #include "ATen/detail/CUDAHooksInterface.h" #include "ATen/CUDAStream.h" +// This is temporary +#include "ATen/core/ATenCoreTest.h" + #include #include #include diff --git a/aten/src/ATen/Error.h b/aten/src/ATen/Error.h index 5a41eb7c74e7cb..2a184d4ecbd5ea 100644 --- a/aten/src/ATen/Error.h +++ b/aten/src/ATen/Error.h @@ -1,131 +1,2 @@ #pragma once - -#include // for AT_API -#include - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -namespace at { - -namespace detail { - -inline std::ostream& _str(std::ostream& ss) { return ss; } - -template -inline std::ostream& _str(std::ostream& ss, const T& t) { - ss << t; - return ss; -} - -template -inline std::ostream& -_str(std::ostream& ss, const T& t, const Args&... args) { - return _str(_str(ss, t), args...); -} - -} // namespace detail - -// Convert a list of string-like arguments into a single string. -template -inline std::string str(const Args&... args) { - std::ostringstream ss; - detail::_str(ss, args...); - return ss.str(); -} - -// Specializations for already-a-string types. -template <> -inline std::string str(const std::string& str) { - return str; -} -inline std::string str(const char* c_str) { - return c_str; -} - -/// Represents a location in source code (for debugging). -struct SourceLocation { - const char* function; - const char* file; - uint32_t line; -}; - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); - -/// The primary ATen error class. -/// Provides a complete error message with source location information via -/// `what()`, and a more concise message via `what_without_backtrace()`. Should -/// primarily be used with the `AT_ERROR` macro. -/// -/// NB: at::Error is handled specially by the default torch to suppress the -/// backtrace, see torch/csrc/Exceptions.h -class AT_API Error : public std::exception { - std::string what_without_backtrace_; - std::string what_; - -public: - Error(SourceLocation source_location, std::string err); - - /// Returns the complete error message, including the source location. - const char* what() const noexcept override { - return what_.c_str(); - } - - /// Returns only the error message string, without source location. 
- const char* what_without_backtrace() const noexcept { - return what_without_backtrace_.c_str(); - } -}; - -class AT_API Warning { - using handler_t = void(*)(const SourceLocation& source_location, const char* msg); - -public: - /// Issue a warning with a given message. Dispatched to the current - /// warning handler. - static void warn(SourceLocation source_location, std::string msg); - - /// Sets the global warning handler. This is not thread-safe, so it should - /// generally be called once during initialization. - static void set_warning_handler(handler_t handler); - - /// The default warning handler. Prints the message to stderr. - static void print_warning(const SourceLocation& source_location, const char* msg); - -private: - static handler_t warning_handler_; -}; - - -} // namespace at - -// TODO: variants that print the expression tested and thus don't require strings -// TODO: CAFFE_ENFORCE_WITH_CALLER style macro - -#define AT_ERROR(...) \ - throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_WARN(...) \ - at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_ASSERT(cond) \ - if (!(cond)) { \ - AT_ERROR(#cond " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch."); \ - } - -#define AT_ASSERTM(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(at::str(#cond, " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch. ", __VA_ARGS__)); \ - } - -#define AT_CHECK(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(at::str(__VA_ARGS__)); \ - } +#include diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 35125cfa6751bb..934be4093b7257 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -111,7 +111,7 @@ inline std::vector expand_outplace(TensorList to_expand) { if (!to_expand[i].defined()) { continue; } else if (first) { - sizes = to_expand[i].sizes(); + sizes = to_expand[i].sizes().vec(); first = false; } else { sizes = infer_size(sizes, to_expand[i].sizes()); diff --git a/aten/src/ATen/Half-inl.h b/aten/src/ATen/Half-inl.h deleted file mode 100644 index e5563faca3ab33..00000000000000 --- a/aten/src/ATen/Half-inl.h +++ /dev/null @@ -1,168 +0,0 @@ -#pragma once - -#include "ATen/ATenGeneral.h" -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -namespace at { - -/// Constructors - -inline AT_HOSTDEVICE Half::Half(float value) { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - x = __half_as_short(__float2half(value)); -#else - x = detail::float2halfbits(value); -#endif -} - -/// Implicit conversions - -inline AT_HOSTDEVICE Half::operator float() const { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(*reinterpret_cast(&x)); -#else - return detail::halfbits2float(x); -#endif -} - -#ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { - x = *reinterpret_cast(&value); -} -inline AT_HOSTDEVICE Half::operator __half() const { - return *reinterpret_cast(&x); -} -#endif - -/// Arithmetic - -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { - return (float)a + (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { - return (float)a - (float)b; -} - -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { - return (float)a * (float)b; -} - -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { - return (float)a / (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a) { - return 
-(float)a; -} - -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { - a = a + b; - return a; -} - -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { - a = a - b; - return a; -} - -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { - a = a * b; - return a; -} - -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { - a = a / b; - return a; -} - -/// Arithmetic with floats - -inline AT_HOSTDEVICE float operator+(Half a, float b) { return (float)a + b; } -inline AT_HOSTDEVICE float operator-(Half a, float b) { return (float)a - b; } -inline AT_HOSTDEVICE float operator*(Half a, float b) { return (float)a * b; } -inline AT_HOSTDEVICE float operator/(Half a, float b) { return (float)a / b; } - -inline AT_HOSTDEVICE float operator+(float a, Half b) { return a + (float)b; } -inline AT_HOSTDEVICE float operator-(float a, Half b) { return a - (float)b; } -inline AT_HOSTDEVICE float operator*(float a, Half b) { return a * (float)b; } -inline AT_HOSTDEVICE float operator/(float a, Half b) { return a / (float)b; } - -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { return a += (float)b; } -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { return a -= (float)b; } -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { return a *= (float)b; } -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { return a /= (float)b; } - -/// Arithmetic with doubles - -inline AT_HOSTDEVICE double operator+(Half a, double b) { return (double)a + b; } -inline AT_HOSTDEVICE double operator-(Half a, double b) { return (double)a - b; } -inline AT_HOSTDEVICE double operator*(Half a, double b) { return (double)a * b; } -inline AT_HOSTDEVICE double operator/(Half a, double b) { return (double)a / b; } - -inline AT_HOSTDEVICE double operator+(double a, Half b) { return a + (double)b; } -inline AT_HOSTDEVICE double operator-(double a, Half b) { return a - (double)b; } -inline AT_HOSTDEVICE double operator*(double a, Half b) { return a * (double)b; } -inline AT_HOSTDEVICE double operator/(double a, Half b) { return a / (double)b; } - -/// Arithmetic with ints - -inline AT_HOSTDEVICE Half operator+(Half a, int b) { return a + (Half)b; } -inline AT_HOSTDEVICE Half operator-(Half a, int b) { return a - (Half)b; } -inline AT_HOSTDEVICE Half operator*(Half a, int b) { return a * (Half)b; } -inline AT_HOSTDEVICE Half operator/(Half a, int b) { return a / (Half)b; } - -inline AT_HOSTDEVICE Half operator+(int a, Half b) { return (Half)a + b; } -inline AT_HOSTDEVICE Half operator-(int a, Half b) { return (Half)a - b; } -inline AT_HOSTDEVICE Half operator*(int a, Half b) { return (Half)a * b; } -inline AT_HOSTDEVICE Half operator/(int a, Half b) { return (Half)a / b; } - -/// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from at::Half to float. 
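(Editorial aside, not part of the patch.) The note above says arithmetic on at::Half is carried out in float32 and comparisons rely on the implicit conversion to float. A minimal host-only sketch of that behaviour, assuming the relocated <ATen/Half.h> keeps the operators shown here:

#include <ATen/Half.h>
#include <iostream>

int main() {
  at::Half a = 1.5f;                // float -> Half via float2halfbits
  at::Half b = 2.25f;
  at::Half sum = a + b;             // computed as (float)a + (float)b, then rounded back to half
  bool less = a < b;                // no Half comparison operator; both sides convert to float
  std::cout << static_cast<float>(sum) << " " << less << std::endl;
  return 0;
}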
- -} // namespace at - -namespace std { - -template<> class numeric_limits { - public: - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - static constexpr int radix = 2; - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = numeric_limits::tinyness_before; - static constexpr at::Half min() { return at::Half(0x0400, at::Half::from_bits); } - static constexpr at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits); } - static constexpr at::Half max() { return at::Half(0x7BFF, at::Half::from_bits); } - static constexpr at::Half epsilon() { return at::Half(0x1400, at::Half::from_bits); } - static constexpr at::Half round_error() { return at::Half(0x3800, at::Half::from_bits); } - static constexpr at::Half infinity() { return at::Half(0x7C00, at::Half::from_bits); } - static constexpr at::Half quiet_NaN() { return at::Half(0x7E00, at::Half::from_bits); } - static constexpr at::Half signaling_NaN() { return at::Half(0x7D00, at::Half::from_bits); } - static constexpr at::Half denorm_min() { return at::Half(0x0001, at::Half::from_bits); } -}; - -} // namespace std diff --git a/aten/src/ATen/Half.cpp b/aten/src/ATen/Half.cpp deleted file mode 100644 index 68f80a56ea8195..00000000000000 --- a/aten/src/ATen/Half.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "ATen/Half.h" - -#include "ATen/Tensor.h" -#include "ATen/Context.h" - -#include -#include - -namespace at { - -static_assert(std::is_standard_layout::value, "at::Half must be standard layout."); - -namespace detail { - -float halfbits2float(unsigned short bits) { - float value; - TH_halfbits2float(&bits, &value); - return value; -} - -unsigned short float2halfbits(float value) { - unsigned short bits; - TH_float2halfbits(&value, &bits); - return bits; -} - -} // namespace detail - -std::ostream& operator<<(std::ostream & out, const Half& value) { - out << (float)value; - return out; -} - -} // namespace at diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h index b7ac47e4fda79a..21941116f19e82 100644 --- a/aten/src/ATen/Half.h +++ b/aten/src/ATen/Half.h @@ -1,120 +1,2 @@ #pragma once - -/// Defines the Half type (half-precision floating-point) including conversions -/// to standard C types and basic arithmetic operations. Note that arithmetic -/// operations are implemented by converting to floating point and -/// performing the operation in float32, instead of using CUDA half intrinisics. -/// Most uses of this type within ATen are memory bound, including the -/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. 
-/// If you are writing a compute bound kernel, you can use the CUDA half -/// intrinsics directly on the Half type from device code. - -#include "ATen/ATenGeneral.h" - -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -#ifndef AT_HOSTDEVICE - #ifdef __CUDACC__ - #define AT_HOSTDEVICE __host__ __device__ - #else - #define AT_HOSTDEVICE - #endif -#endif - -namespace at { - -namespace detail { - -AT_API float halfbits2float(unsigned short bits); -AT_API unsigned short float2halfbits(float value); - -} - -struct alignas(2) Half { - unsigned short x; - - struct from_bits_t {}; - static constexpr from_bits_t from_bits = from_bits_t(); - - // HIP wants __host__ __device__ tag, CUDA does not -#ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; -#else - Half() = default; -#endif - - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; - -#ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; -#endif -}; - -template To convert(From f) { - return static_cast(f); -} - -// skip isnan and isinf check for integral types -template -typename std::enable_if::value, bool>::type overflows(From f) { - using limit = std::numeric_limits; - if (!limit::is_signed && std::numeric_limits::is_signed) { - // allow for negative numbers to wrap using two's complement arithmetic. - // For example, with uint8, this allows for `a - b` to be treated as - // `a + 255 * b`. - return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); - } else { - return f < limit::lowest() || f > limit::max(); - } -} - -template -typename std::enable_if::value, bool>::type overflows(From f) { - using limit = std::numeric_limits; - if (limit::has_infinity && std::isinf((double)f)) { - return false; - } - if (!limit::has_quiet_NaN && (f != f)) { - return true; - } - return f < limit::lowest() || f > limit::max(); -} - -template To checked_convert(From f, const char* name) { - if (overflows(f)) { - std::string msg = "value cannot be converted to type "; - msg += name; - msg += " without overflow: "; - msg += std::to_string(f); - throw std::domain_error(std::move(msg)); - } - return convert(f); -} - -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - -AT_API std::ostream& operator<<(std::ostream & out, const Half& value); - -} // namespace at - -#include "Half-inl.h" - -#undef AT_HOSTDEVICE +#include diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 794d8e5f8c31a9..6aadd62eb1d3fd 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -37,7 +37,9 @@ inline void parallel_for( f(begin_tid, std::min(end, chunk_size + begin_tid)); } #else - f(begin, end); + if (begin < end) { + f(begin, end); + } #endif } diff --git a/aten/src/ATen/Scalar.h b/aten/src/ATen/Scalar.h index e80d467b138ac3..f0b84d67554c02 100644 --- a/aten/src/ATen/Scalar.h +++ b/aten/src/ATen/Scalar.h @@ -10,7 +10,6 @@ #include "ATen/Half.h" #include "ATen/ScalarType.h" #include "ATen/TensorBase.h" -#include "ATen/Utils.h" namespace at { diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index f7c9243a89df2a..3651aef60e3e1e 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -10,16 +10,16 @@ namespace at { // NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and probably other places. 
+// _promoteTypesLookup and the serialization format. #define AT_FORALL_SCALAR_TYPES(_) \ -_(uint8_t,Byte,i) \ -_(int8_t,Char,i) \ -_(int16_t,Short,i) \ -_(int,Int,i) \ -_(int64_t,Long,i) \ -_(at::Half,Half,d) \ -_(float,Float,d) \ -_(double,Double,d) +_(uint8_t,Byte,i) /* 0 */ \ +_(int8_t,Char,i) /* 1 */ \ +_(int16_t,Short,i) /* 2 */ \ +_(int,Int,i) /* 3 */ \ +_(int64_t,Long,i) /* 4 */ \ +_(at::Half,Half,d) /* 5 */ \ +_(float,Float,d) /* 6 */ \ +_(double,Double,d) /* 7 */ #define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ _(uint8_t,Byte,i) \ @@ -35,7 +35,7 @@ enum class ScalarType { n, AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) #undef DEFINE_ENUM - Undefined, + Undefined, // 8 NumOptions }; diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 7c52ef686aa41a..1dbaa933c555dd 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -1,982 +1,2 @@ -//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the SmallVector class. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::SmallVector. -// replaced report_bad_alloc_error with std::bad_alloc -// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE -// replaced iterator_range constructor with inline Container&& constructor -// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers -// removed LLVM_UNLIKELY - #pragma once - -#include "AlignOf.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if __GNUG__ && __GNUC__ < 5 -#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) -#else -#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value -#endif - -namespace at { - -namespace detail { - -// From llvm/Support/MathExtras.h -static inline uint64_t NextPowerOf2(uint64_t A) { - A |= (A >> 1); - A |= (A >> 2); - A |= (A >> 4); - A |= (A >> 8); - A |= (A >> 16); - A |= (A >> 32); - return A + 1; -} - -} - -/// This is all the non-templated stuff common to all SmallVectors. -class AT_API SmallVectorBase { -protected: - void *BeginX, *EndX, *CapacityX; - -protected: - SmallVectorBase(void *FirstEl, size_t Size) - : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} - - /// This is an implementation of the grow() method which only works - /// on POD-like data types and is out of line to reduce code duplication. - void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize); - -public: - /// This returns size()*sizeof(T). - size_t size_in_bytes() const { - return size_t((char*)EndX - (char*)BeginX); - } - - /// capacity_in_bytes - This returns capacity()*sizeof(T). - size_t capacity_in_bytes() const { - return size_t((char*)CapacityX - (char*)BeginX); - } - - bool empty() const { return BeginX == EndX; } -}; - -/// This is the part of SmallVectorTemplateBase which does not depend on whether -/// the type T is a POD. The extra dummy template argument is used by ArrayRef -/// to avoid unnecessarily requiring T to be complete. -template -class SmallVectorTemplateCommon : public SmallVectorBase { -private: - template friend struct SmallVectorStorage; - - // Allocate raw space for N elements of type T. 
If T has a ctor or dtor, we - // don't want it to be automatically run, so we need to represent the space as - // something else. Use an array of char of sufficient alignment. - using U = AlignedCharArrayUnion; - U FirstEl; - // Space after 'FirstEl' is clobbered, do not add any instance vars after it. - -protected: - SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} - - void grow_pod(size_t MinSizeInBytes, size_t TSize) { - SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); - } - - /// Return true if this is a smallvector which has not had dynamic - /// memory allocated for it. - bool isSmall() const { - return BeginX == static_cast(&FirstEl); - } - - /// Put this vector in a state of being small. - void resetToSmall() { - BeginX = EndX = CapacityX = &FirstEl; - } - - void setEnd(T *P) { this->EndX = P; } - -public: - using size_type = size_t; - using difference_type = ptrdiff_t; - using value_type = T; - using iterator = T *; - using const_iterator = const T *; - - using const_reverse_iterator = std::reverse_iterator; - using reverse_iterator = std::reverse_iterator; - - using reference = T &; - using const_reference = const T &; - using pointer = T *; - using const_pointer = const T *; - - // forward iterator creation methods. - iterator begin() { return (iterator)this->BeginX; } - const_iterator begin() const { return (const_iterator)this->BeginX; } - iterator end() { return (iterator)this->EndX; } - const_iterator end() const { return (const_iterator)this->EndX; } - -protected: - iterator capacity_ptr() { return (iterator)this->CapacityX; } - const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} - -public: - // reverse iterator creation methods. - reverse_iterator rbegin() { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } - reverse_iterator rend() { return reverse_iterator(begin()); } - const_reverse_iterator rend() const { return const_reverse_iterator(begin());} - - size_type size() const { return end()-begin(); } - size_type max_size() const { return size_type(-1) / sizeof(T); } - - /// Return the total number of elements in the currently allocated buffer. - size_t capacity() const { return capacity_ptr() - begin(); } - - /// Return a pointer to the vector's buffer, even if empty(). - pointer data() { return pointer(begin()); } - /// Return a pointer to the vector's buffer, even if empty(). - const_pointer data() const { return const_pointer(begin()); } - - reference operator[](size_type idx) { - assert(idx < size()); - return begin()[idx]; - } - const_reference operator[](size_type idx) const { - assert(idx < size()); - return begin()[idx]; - } - - reference front() { - assert(!empty()); - return begin()[0]; - } - const_reference front() const { - assert(!empty()); - return begin()[0]; - } - - reference back() { - assert(!empty()); - return end()[-1]; - } - const_reference back() const { - assert(!empty()); - return end()[-1]; - } -}; - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with non-POD-like T's. -template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { -protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - static void destroy_range(T *S, T *E) { - while (S != E) { - --E; - E->~T(); - } - } - - /// Move the range [I, E) into the uninitialized memory starting with "Dest", - /// constructing elements as needed. 
- template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy(std::make_move_iterator(I), - std::make_move_iterator(E), Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", - /// constructing elements as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy(I, E, Dest); - } - - /// Grow the allocated memory (without initializing new elements), doubling - /// the size of the allocated memory. Guarantees space for at least one more - /// element, or MinSize more elements if specified. - void grow(size_t MinSize = 0); - -public: - void push_back(const T &Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*) this->end()) T(Elt); - this->setEnd(this->end()+1); - } - - void push_back(T &&Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*) this->end()) T(::std::move(Elt)); - this->setEnd(this->end()+1); - } - - void pop_back() { - this->setEnd(this->end()-1); - this->end()->~T(); - } -}; - -// Define this out-of-line to dissuade the C++ compiler from inlining it. -template -void SmallVectorTemplateBase::grow(size_t MinSize) { - size_t CurCapacity = this->capacity(); - size_t CurSize = this->size(); - // Always grow, even from zero. - size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity+2)); - if (NewCapacity < MinSize) - NewCapacity = MinSize; - T *NewElts = static_cast(malloc(NewCapacity*sizeof(T))); - if (NewElts == nullptr) - throw std::bad_alloc(); - - // Move the elements over. - this->uninitialized_move(this->begin(), this->end(), NewElts); - - // Destroy the original elements. - destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - free(this->begin()); - - this->setEnd(NewElts+CurSize); - this->BeginX = NewElts; - this->CapacityX = this->begin()+NewCapacity; -} - - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with POD-like T's. -template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { -protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - // No need to do a destroy loop for POD's. - static void destroy_range(T *, T *) {} - - /// Move the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - // Just do a copy. - uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - // Arbitrary iterator types; just use the basic implementation. - std::uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy( - T1 *I, T1 *E, T2 *Dest, - typename std::enable_if::type, - T2>::value>::type * = nullptr) { - // Use memcpy for PODs iterated by pointers (which includes SmallVector - // iterators): std::uninitialized_copy optimizes to memmove, but we can - // use memcpy here. Note that I and E are iterators and thus might be - // invalid for memcpy if they are equal. 
- if (I != E) - memcpy(Dest, I, (E - I) * sizeof(T)); - } - - /// Double the size of the allocated memory, guaranteeing space for at - /// least one more element or MinSize if specified. - void grow(size_t MinSize = 0) { - this->grow_pod(MinSize*sizeof(T), sizeof(T)); - } - -public: - void push_back(const T &Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - memcpy(this->end(), &Elt, sizeof(T)); - this->setEnd(this->end()+1); - } - - void pop_back() { - this->setEnd(this->end()-1); - } -}; - -/// This class consists of common code factored out of the SmallVector class to -/// reduce code duplication based on the SmallVector 'N' template parameter. -template -class SmallVectorImpl : public SmallVectorTemplateBase { - using SuperClass = SmallVectorTemplateBase; - -public: - using iterator = typename SuperClass::iterator; - using const_iterator = typename SuperClass::const_iterator; - using size_type = typename SuperClass::size_type; - -protected: - // Default ctor - Initialize to empty. - explicit SmallVectorImpl(unsigned N) - : SmallVectorTemplateBase(N*sizeof(T)) { - } - -public: - SmallVectorImpl(const SmallVectorImpl &) = delete; - - ~SmallVectorImpl() { - // Destroy the constructed elements in the vector. - this->destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - free(this->begin()); - } - - void clear() { - this->destroy_range(this->begin(), this->end()); - this->EndX = this->BeginX; - } - - void resize(size_type N) { - if (N < this->size()) { - this->destroy_range(this->begin()+N, this->end()); - this->setEnd(this->begin()+N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - auto I = this->end(); - for (auto E = this->begin() + N; I != E; ++I) - new (&*I) T(); - this->setEnd(this->begin()+N); - } - } - - void resize(size_type N, const T &NV) { - if (N < this->size()) { - this->destroy_range(this->begin()+N, this->end()); - this->setEnd(this->begin()+N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - std::uninitialized_fill(this->end(), this->begin()+N, NV); - this->setEnd(this->begin()+N); - } - } - - void reserve(size_type N) { - if (this->capacity() < N) - this->grow(N); - } - - T pop_back_val() { - T Result = ::std::move(this->back()); - this->pop_back(); - return Result; - } - - void swap(SmallVectorImpl &RHS); - - /// Add the specified range to the end of the SmallVector. - template ::iterator_category, - std::input_iterator_tag>::value>::type> - void append(in_iter in_start, in_iter in_end) { - size_type NumInputs = std::distance(in_start, in_end); - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr()-this->end())) - this->grow(this->size()+NumInputs); - - // Copy the new elements over. - this->uninitialized_copy(in_start, in_end, this->end()); - this->setEnd(this->end() + NumInputs); - } - - /// Add the specified range to the end of the SmallVector. - void append(size_type NumInputs, const T &Elt) { - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr()-this->end())) - this->grow(this->size()+NumInputs); - - // Copy the new elements over. - std::uninitialized_fill_n(this->end(), NumInputs, Elt); - this->setEnd(this->end() + NumInputs); - } - - void append(std::initializer_list IL) { - append(IL.begin(), IL.end()); - } - - // FIXME: Consider assigning over existing elements, rather than clearing & - // re-initializing them - for all assign(...) 
variants. - - void assign(size_type NumElts, const T &Elt) { - clear(); - if (this->capacity() < NumElts) - this->grow(NumElts); - this->setEnd(this->begin()+NumElts); - std::uninitialized_fill(this->begin(), this->end(), Elt); - } - - template ::iterator_category, - std::input_iterator_tag>::value>::type> - void assign(in_iter in_start, in_iter in_end) { - clear(); - append(in_start, in_end); - } - - void assign(std::initializer_list IL) { - clear(); - append(IL); - } - - iterator erase(const_iterator CI) { - // Just cast away constness because this is a non-const member function. - iterator I = const_cast(CI); - - assert(I >= this->begin() && "Iterator to erase is out of bounds."); - assert(I < this->end() && "Erasing at past-the-end iterator."); - - iterator N = I; - // Shift all elts down one. - std::move(I+1, this->end(), I); - // Drop the last elt. - this->pop_back(); - return(N); - } - - iterator erase(const_iterator CS, const_iterator CE) { - // Just cast away constness because this is a non-const member function. - iterator S = const_cast(CS); - iterator E = const_cast(CE); - - assert(S >= this->begin() && "Range to erase is out of bounds."); - assert(S <= E && "Trying to erase invalid range."); - assert(E <= this->end() && "Trying to erase past the end."); - - iterator N = S; - // Shift all elts down. - iterator I = std::move(E, this->end(), S); - // Drop the last elts. - this->destroy_range(I, this->end()); - this->setEnd(I); - return(N); - } - - iterator insert(iterator I, T &&Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(::std::move(Elt)); - return this->end()-1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - } - - ::new ((void*) this->end()) T(::std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end()-1, this->end()); - this->setEnd(this->end()+1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = ::std::move(*EltPtr); - return I; - } - - iterator insert(iterator I, const T &Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(Elt); - return this->end()-1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - } - ::new ((void*) this->end()) T(std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end()-1, this->end()); - this->setEnd(this->end()+1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - const T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = *EltPtr; - return I; - } - - iterator insert(iterator I, size_type NumToInsert, const T &Elt) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. 
- append(NumToInsert, Elt); - return this->begin()+InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin()+InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end()-I) >= NumToInsert) { - T *OldEnd = this->end(); - append(std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd-NumToInsert, OldEnd); - - std::fill_n(I, NumToInsert, Elt); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. - T *OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd-I; - this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); - - // Replace the overwritten part. - std::fill_n(I, NumOverwritten, Elt); - - // Insert the non-overwritten middle part. - std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); - return I; - } - - template ::iterator_category, - std::input_iterator_tag>::value>::type> - iterator insert(iterator I, ItTy From, ItTy To) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. - append(From, To); - return this->begin()+InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - size_t NumToInsert = std::distance(From, To); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin()+InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end()-I) >= NumToInsert) { - T *OldEnd = this->end(); - append(std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd-NumToInsert, OldEnd); - - std::copy(From, To, I); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. - T *OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd-I; - this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); - - // Replace the overwritten part. - for (T *J = I; NumOverwritten > 0; --NumOverwritten) { - *J = *From; - ++J; ++From; - } - - // Insert the non-overwritten middle part. - this->uninitialized_copy(From, To, OldEnd); - return I; - } - - void insert(iterator I, std::initializer_list IL) { - insert(I, IL.begin(), IL.end()); - } - - template void emplace_back(ArgTypes &&... 
Args) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void *)this->end()) T(std::forward(Args)...); - this->setEnd(this->end() + 1); - } - - SmallVectorImpl &operator=(const SmallVectorImpl &RHS); - - SmallVectorImpl &operator=(SmallVectorImpl &&RHS); - - bool operator==(const SmallVectorImpl &RHS) const { - if (this->size() != RHS.size()) return false; - return std::equal(this->begin(), this->end(), RHS.begin()); - } - bool operator!=(const SmallVectorImpl &RHS) const { - return !(*this == RHS); - } - - bool operator<(const SmallVectorImpl &RHS) const { - return std::lexicographical_compare(this->begin(), this->end(), - RHS.begin(), RHS.end()); - } - - /// Set the array size to \p N, which the current array must have enough - /// capacity for. - /// - /// This does not construct or destroy any elements in the vector. - /// - /// Clients can use this in conjunction with capacity() to write past the end - /// of the buffer when they know that more elements are available, and only - /// update the size later. This avoids the cost of value initializing elements - /// which will only be overwritten. - void set_size(size_type N) { - assert(N <= this->capacity()); - this->setEnd(this->begin() + N); - } -}; - -template -void SmallVectorImpl::swap(SmallVectorImpl &RHS) { - if (this == &RHS) return; - - // We can only avoid copying elements if neither vector is small. - if (!this->isSmall() && !RHS.isSmall()) { - std::swap(this->BeginX, RHS.BeginX); - std::swap(this->EndX, RHS.EndX); - std::swap(this->CapacityX, RHS.CapacityX); - return; - } - if (RHS.size() > this->capacity()) - this->grow(RHS.size()); - if (this->size() > RHS.capacity()) - RHS.grow(this->size()); - - // Swap the shared elements. - size_t NumShared = this->size(); - if (NumShared > RHS.size()) NumShared = RHS.size(); - for (size_type i = 0; i != NumShared; ++i) - std::swap((*this)[i], RHS[i]); - - // Copy over the extra elts. - if (this->size() > RHS.size()) { - size_t EltDiff = this->size() - RHS.size(); - this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); - RHS.setEnd(RHS.end()+EltDiff); - this->destroy_range(this->begin()+NumShared, this->end()); - this->setEnd(this->begin()+NumShared); - } else if (RHS.size() > this->size()) { - size_t EltDiff = RHS.size() - this->size(); - this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); - this->setEnd(this->end() + EltDiff); - this->destroy_range(RHS.begin()+NumShared, RHS.end()); - RHS.setEnd(RHS.begin()+NumShared); - } -} - -template -SmallVectorImpl &SmallVectorImpl:: - operator=(const SmallVectorImpl &RHS) { - // Avoid self-assignment. - if (this == &RHS) return *this; - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. - size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd; - if (RHSSize) - NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); - else - NewEnd = this->begin(); - - // Destroy excess elements. - this->destroy_range(NewEnd, this->end()); - - // Trim. - this->setEnd(NewEnd); - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. - if (this->capacity() < RHSSize) { - // Destroy current elements. 
- this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); - } - - // Copy construct the new elements in place. - this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), - this->begin()+CurSize); - - // Set end. - this->setEnd(this->begin()+RHSSize); - return *this; -} - -template -SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { - // Avoid self-assignment. - if (this == &RHS) return *this; - - // If the RHS isn't small, clear this vector and then steal its buffer. - if (!RHS.isSmall()) { - this->destroy_range(this->begin(), this->end()); - if (!this->isSmall()) free(this->begin()); - this->BeginX = RHS.BeginX; - this->EndX = RHS.EndX; - this->CapacityX = RHS.CapacityX; - RHS.resetToSmall(); - return *this; - } - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. - size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd = this->begin(); - if (RHSSize) - NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); - - // Destroy excess elements and trim the bounds. - this->destroy_range(NewEnd, this->end()); - this->setEnd(NewEnd); - - // Clear the RHS. - RHS.clear(); - - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: this may not actually make any sense if we can efficiently move - // elements. - if (this->capacity() < RHSSize) { - // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); - } - - // Move-construct the new elements in place. - this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), - this->begin()+CurSize); - - // Set end. - this->setEnd(this->begin()+RHSSize); - - RHS.clear(); - return *this; -} - -/// Storage for the SmallVector elements which aren't contained in -/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' -/// element is in the base class. This is specialized for the N=1 and N=0 cases -/// to avoid allocating unnecessary storage. -template -struct SmallVectorStorage { - typename SmallVectorTemplateCommon::U InlineElts[N - 1]; -}; -template struct SmallVectorStorage {}; -template struct SmallVectorStorage {}; - -/// This is a 'vector' (really, a variable-sized array), optimized -/// for the case when the array is small. It contains some number of elements -/// in-place, which allows it to avoid heap allocation when the actual number of -/// elements is below that threshold. This allows normal "small" cases to be -/// fast without losing generality for large inputs. -/// -/// Note that this does not attempt to be exception safe. -/// -template -class SmallVector : public SmallVectorImpl { - /// Inline space for elements which aren't stored in the base class. 
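(Editorial aside, not part of the patch.) The class comment above explains that SmallVector keeps up to N elements in inline storage and only falls back to the heap once that threshold is exceeded. A short sketch of that behaviour, assuming <ATen/SmallVector.h> continues to expose the same interface after the move to ATen/core:

#include <ATen/SmallVector.h>
#include <cassert>

void smallvector_demo() {
  at::SmallVector<int, 4> v;   // room for 4 elements inline, no heap allocation yet
  for (int i = 0; i < 4; ++i)
    v.push_back(i);            // still using the inline buffer
  assert(v.capacity() >= 4);
  v.push_back(4);              // exceeds N = 4: grow() reallocates into heap storage
  assert(v.size() == 5 && v[4] == 4);
}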
- SmallVectorStorage Storage; - -public: - SmallVector() : SmallVectorImpl(N) {} - - explicit SmallVector(size_t Size, const T &Value = T()) - : SmallVectorImpl(N) { - this->assign(Size, Value); - } - - template ::iterator_category, - std::input_iterator_tag>::value>::type> - SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { - this->append(S, E); - } - - template - explicit SmallVector(Container &&c) : SmallVectorImpl(N) { - this->append(c.begin(), c.end()); - } - - SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { - this->assign(IL); - } - - SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(RHS); - } - - const SmallVector &operator=(const SmallVector &RHS) { - SmallVectorImpl::operator=(RHS); - return *this; - } - - SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - template - const SmallVector &operator=(const Container &RHS) { - this->assign(RHS.begin(), RHS.end()); - return *this; - } - - SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - const SmallVector &operator=(SmallVector &&RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - const SmallVector &operator=(SmallVectorImpl &&RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - template - const SmallVector &operator=(Container &&C) { - this->assign(C.begin(), C.end()); - return *this; - } - - const SmallVector &operator=(std::initializer_list IL) { - this->assign(IL); - return *this; - } -}; - -template -inline size_t capacity_in_bytes(const SmallVector &X) { - return X.capacity_in_bytes(); -} - -} // end namespace at - -namespace std { - - /// Implement std::swap in terms of SmallVector swap. - template - inline void - swap(at::SmallVectorImpl &LHS, at::SmallVectorImpl &RHS) { - LHS.swap(RHS); - } - - /// Implement std::swap in terms of SmallVector swap. - template - inline void - swap(at::SmallVector &LHS, at::SmallVector &RHS) { - LHS.swap(RHS); - } - -} // end namespace std +#include diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 968fd8ebbec266..03a5a6008e7d24 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -18,14 +18,14 @@ namespace at { // tensor and a [0] size values tensor for such an empty tensor. However, // we don't currently support zero-size dimensions, so we can't actually // do this; so we just allocate zero-size tensors for everything. 
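// Illustrative shape convention (illustration only, not part of this patch),
// matching the comment above: a sparse COO tensor with sparseDims = S,
// denseDims = D and nnz non-zeros stores
//   indices_ : LongTensor of shape [S, nnz]
//   values_  : Tensor of shape [nnz, size[S], ..., size[S + D - 1]]
// so the "ideal" empty tensor described above would carry indices of size
// [1, 0] and values of size [0]; lacking zero-size dimensions, the constructor
// below allocates plain zero-size tensors instead.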
-SparseTensorImpl::SparseTensorImpl(Type * type) - : TensorImpl(type, nullptr) +SparseTensorImpl::SparseTensorImpl(at::Backend backend, at::ScalarType scalar_type) + : TensorImpl(backend, scalar_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) - , indices_(type->toDense().toScalarType(ScalarType::Long).tensor()) - , values_(type->toDense().tensor()) { - AT_ASSERT(type->is_sparse()); + , indices_(globalContext().getTypeOpt(toDense(backend), ScalarType::Long)->tensor()) + , values_(globalContext().getTypeOpt(toDense(backend), scalar_type)->tensor()) { + AT_ASSERT(backend == Backend::SparseCPU || backend == Backend::SparseCUDA); } IntList SparseTensorImpl::sizes() const { diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 9ef08705bb0f45..307c0f9e5574d1 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -48,7 +48,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(Type * type); + explicit SparseTensorImpl(at::Backend, at::ScalarType); int64_t nnz() const { return nnz_; } int64_t sparseDims() const { return sparseDims_; } @@ -75,7 +75,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { if (size.size() == 0) { size_ = {0}; } else { - size_ = size; + size_ = size.vec(); } sparseDims_ = sparseDims; denseDims_ = denseDims; diff --git a/aten/src/ATen/Storage.cpp b/aten/src/ATen/Storage.cpp index f5ba512cc27105..991cfba92efd2a 100644 --- a/aten/src/ATen/Storage.cpp +++ b/aten/src/ATen/Storage.cpp @@ -1,23 +1,32 @@ #include -#include #include namespace at { +Storage::Storage(at::ScalarType scalar_type, size_t size, Allocator* allocator) + : storage_impl_(new StorageImpl( + scalar_type, + size, + allocator, + /* resizable */ false)) {} + +Storage::Storage( + at::ScalarType scalar_type, + at::DataPtr data_ptr, + size_t size, + const std::function& deleter) + : storage_impl_(new StorageImpl( + scalar_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + /* resizable */ false)) {} + Storage::~Storage() { if (!storage_impl_) { return; } - if (--storage_impl_->refcount == 0) { - if (storage_impl_->finalizer) { - (*storage_impl_->finalizer)(); - } - storage_impl_->finalizer = nullptr; - storage_impl_->data_ptr.clear(); - if (storage_impl_ && --storage_impl_->weakcount == 0) { - delete storage_impl_; - } - } + storage_impl_->release(); } } // namespace at diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index a5c85192e36f8c..aa27296c74d40f 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -8,6 +8,12 @@ struct AT_API Storage { public: Storage() = delete; Storage(StorageImpl* storage_impl) : storage_impl_(storage_impl) {} + Storage(at::ScalarType, size_t size, Allocator* allocator); + Storage( + at::ScalarType, + at::DataPtr, + size_t size, + const std::function& deleter); ~Storage(); // There are reasonable interpretations of these constructors, but they're to // be implemented on demand. 
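For readers following the refcounting change: the new Storage constructors above allocate a StorageImpl, the destructor now just calls release() on it, and StorageImpl (next hunk) inherits its count from a shared Retainable base while exposing release_resources() as its cleanup hook. A minimal sketch of that pattern, using simplified stand-in names rather than ATen's real Retainable:

#include <atomic>

// Sketch only: RetainableSketch / StorageImplSketch are illustrative stand-ins,
// not the actual at::Retainable / at::StorageImpl.
struct RetainableSketch {
  std::atomic<int> refcount{1};
  virtual ~RetainableSketch() = default;
  virtual void release_resources() {}  // subclass hook, runs before deletion
  void retain() { ++refcount; }
  void release() {
    if (--refcount == 0) {             // last strong reference gone
      release_resources();
      delete this;
    }
  }
};

struct StorageImplSketch : RetainableSketch {
  void release_resources() override {
    // This is where the old ~Storage() logic lands in the patch:
    // run the finalizer, then clear the DataPtr.
  }
};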
diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index a26f8971310aa5..6e3d693d012c5c 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,8 +12,6 @@ StorageImpl::StorageImpl( : scalar_type(scalar_type), data_ptr(std::move(data_ptr)), size(size), - refcount(1), - weakcount(1), // from the strong reference resizable(resizable), allocator(allocator), finalizer(nullptr) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index c48ec51e013d4c..f1c23c54677dba 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,7 @@ namespace at { struct Type; -struct TH_CPP_API StorageImpl { +struct AT_API StorageImpl : public Retainable { StorageImpl() = delete; virtual ~StorageImpl() {}; @@ -48,8 +49,6 @@ struct TH_CPP_API StorageImpl { at::ScalarType scalar_type; at::DataPtr data_ptr; ptrdiff_t size; - std::atomic refcount; - std::atomic weakcount; bool resizable; at::Allocator* allocator; std::unique_ptr finalizer; @@ -58,6 +57,8 @@ struct TH_CPP_API StorageImpl { StorageImpl(StorageImpl&&) = delete; StorageImpl(const StorageImpl&&) = delete; + // TODO: Rename this into th_data, and move it out of the class; + // the real data shouldn't call th::from_type template inline T* data() const { auto scalar_type_T = at::CTypeToScalarType>::to(); @@ -76,6 +77,14 @@ struct TH_CPP_API StorageImpl { return static_cast(this->data_ptr.get()); } + void release_resources() { + if (finalizer) { + (*finalizer)(); + } + finalizer = nullptr; + data_ptr.clear(); + } + void operator=(const StorageImpl&) = delete; virtual size_t elementSize() const { @@ -94,9 +103,6 @@ struct TH_CPP_API StorageImpl { const void* data() const { return data_ptr.get(); }; - void retain() { - ++refcount; - } int getDevice() const { return data_ptr.device().index(); diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 55e7d3de6dea4a..8ebcfdaeada40f 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -64,7 +64,6 @@ class THLongStorageView { storage.size = ref.size(); } storage.scalar_type = at::CTypeToScalarType>::to(); - storage.refcount = 0; storage.set_resizable(false); } private: diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 60f6098762cd05..15f59e902182c4 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -9,7 +9,7 @@ struct AT_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} explicit TensorGeometry(IntList sizes) - : sizes_(sizes) + : sizes_(sizes.vec()) , strides_(sizes.size()) , storage_offset_(0) { int64_t dim = sizes.size(); @@ -21,8 +21,8 @@ struct AT_API TensorGeometry { } explicit TensorGeometry(const Tensor& t) - : sizes_(t.sizes()) - , strides_(t.strides()) + : sizes_(t.sizes().vec()) + , strides_(t.strides().vec()) , storage_offset_(t.storage_offset()) {} // true if the tensor is contiguous diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp index 59cc303a1acf5c..a48cb033b2de49 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/TensorImpl.cpp @@ -2,10 +2,23 @@ #include #include +#include + +#include #include namespace at { + +Type& TensorImpl::type() const { + Type* base_type = &globalContext().getType(backend_, scalar_type_); + if (is_variable_) { + return detail::getVariableHooks().getVariableType(*base_type); + } else { + return *base_type; + } +} + Tensor& 
TensorImpl::grad() { AT_ERROR("grad is not implemented for Tensor"); } diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 9c3591eb96b31f..1aa4d8390ed175 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -18,16 +18,18 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public Retainable { - explicit TensorImpl(Type * type, THTensor * tensor) - : type_(type), tensor(tensor) {} + explicit TensorImpl(Backend backend, ScalarType scalar_type, THTensor * tensor, bool is_variable) + : backend_(backend), scalar_type_(scalar_type), is_variable_(is_variable), tensor(tensor) {} virtual ~TensorImpl(); virtual void release_resources() override; - Type & type() const { - return *type_; - } + // The implementation of this method will have to be hoisted out and + // hooked in, so that Caffe2 doesn't need to know about Context + // TODO: This really really needs to be inlined. + Type & type() const; + const char * toString() const; virtual IntList sizes() const; virtual IntList strides() const; @@ -91,8 +93,12 @@ struct AT_API TensorImpl : public Retainable { virtual void set_data(Tensor new_data); protected: + Backend backend_; + // INVARIANT: When storage is non-null, this scalar type must + // agree with the scalar type in storage + ScalarType scalar_type_; + bool is_variable_ = false; bool is_wrapped_number_ = false; - Type * type_; public: THTensor * tensor; }; diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 5e4059421c1283..ecfb70fa1bbede 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -6,7 +6,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? UndefinedTensor::UndefinedTensor() -: TensorImpl(&(globalContext().getType(Backend::Undefined,ScalarType::Undefined)), nullptr) { +: TensorImpl(Backend::Undefined, ScalarType::Undefined, nullptr, /* is variable */ false) { } IntList UndefinedTensor::sizes() const { diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp new file mode 100644 index 00000000000000..5bb595a0bce5de --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -0,0 +1,10 @@ +#include + +namespace at { + +static int CoreTestGlobal = 0; +int CoreTest() { + return CoreTestGlobal++; +} + +} // namespace at diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h new file mode 100644 index 00000000000000..ee8471f66fe258 --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +namespace at { + +AT_CORE_API int CoreTest(); +} diff --git a/aten/src/ATen/AlignOf.h b/aten/src/ATen/core/AlignOf.h similarity index 68% rename from aten/src/ATen/AlignOf.h rename to aten/src/ATen/core/AlignOf.h index 5e9f0127b32e70..a7e42196f43ecd 100644 --- a/aten/src/ATen/AlignOf.h +++ b/aten/src/ATen/core/AlignOf.h @@ -33,7 +33,7 @@ namespace at { // MSVC requires special handling here. #ifndef _MSC_VER -template +template struct AlignedCharArray { alignas(Alignment) char buffer[Size]; }; @@ -41,7 +41,7 @@ struct AlignedCharArray { #else // _MSC_VER /// \brief Create a type with an aligned char buffer. -template +template struct AlignedCharArray; // We provide special variations of this template for the most common @@ -52,7 +52,7 @@ struct AlignedCharArray; // MSVC warns on the existence of the declspec despite the union member forcing // proper alignment. 
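// Illustrative usage of AlignedCharArrayUnion (not part of this patch): it is a
// POD buffer whose size and alignment suffice for any of the listed types, so
// it can back a placement-new without default-constructing anything.
// SmallVector, further down in this patch, uses it this way for its first
// inline element. The include path below is an assumption based on the new
// core location; Pair and aligned_union_sketch are hypothetical names.
#include <new>
#include <ATen/core/AlignOf.h>

struct Pair { int a; double b; };   // hypothetical payload type

void aligned_union_sketch() {       // illustration only
  at::AlignedCharArrayUnion<int, double, Pair> storage;
  auto* p = ::new (static_cast<void*>(&storage)) Pair{1, 2.0};  // construct in place
  p->~Pair();                                                   // destroy manually
}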
-template +template struct AlignedCharArray<1, Size> { union { char aligned; @@ -60,7 +60,7 @@ struct AlignedCharArray<1, Size> { }; }; -template +template struct AlignedCharArray<2, Size> { union { short aligned; @@ -68,7 +68,7 @@ struct AlignedCharArray<2, Size> { }; }; -template +template struct AlignedCharArray<4, Size> { union { int aligned; @@ -76,7 +76,7 @@ struct AlignedCharArray<4, Size> { }; }; -template +template struct AlignedCharArray<8, Size> { union { double aligned; @@ -84,14 +84,13 @@ struct AlignedCharArray<8, Size> { }; }; - // The rest of these are provided with a __declspec(align(...)) and we simply // can't pass them by-value as function arguments on MSVC. #define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ - template \ - struct AlignedCharArray { \ - __declspec(align(x)) char buffer[Size]; \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ }; AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) @@ -104,24 +103,47 @@ AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) #endif // _MSC_VER namespace detail { -template +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> class AlignerImpl { - T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; + T1 t1; + T2 t2; + T3 t3; + T4 t4; + T5 t5; + T6 t6; + T7 t7; + T8 t8; + T9 t9; + T10 t10; AlignerImpl() = delete; }; -template +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> union SizerImpl { char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], - arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], - arr9[sizeof(T9)], arr10[sizeof(T10)]; + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; }; } // end namespace detail @@ -132,14 +154,20 @@ union SizerImpl { /// expose a char array buffer member which can be used as suitable storage for /// a placement new of any of these types. Support for more than ten types can /// be added at the cost of more boilerplate. -template -struct AlignedCharArrayUnion : AlignedCharArray< - alignof(detail::AlignerImpl), - sizeof(::at::detail::SizerImpl)> { -}; +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +struct AlignedCharArrayUnion + : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::at::detail:: + SizerImpl)> {}; } // end namespace at diff --git a/aten/src/ATen/core/ArrayRef.h b/aten/src/ATen/core/ArrayRef.h new file mode 100644 index 00000000000000..7e997d6572f3c0 --- /dev/null +++ b/aten/src/ATen/core/ArrayRef.h @@ -0,0 +1,212 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. 
+// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace at { + +/// ArrayRef - Represent a constant reference to an array (0 or more elements +/// consecutively in memory), i.e. a start pointer and a length. It allows +/// various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the ArrayRef. For this reason, it is not in general +/// safe to store an ArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. +template +class ArrayRef final { + public: + using iterator = const T*; + using const_iterator = const T*; + using size_type = size_t; + + using reverse_iterator = std::reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T* Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + // TODO Make this explicit + constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + constexpr ArrayRef(const T* data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + constexpr ArrayRef(const T* begin, const T* end) + : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::vector. + template + /* implicit */ ArrayRef(const std::vector& Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /* implicit */ constexpr ArrayRef(const std::array& Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) + : Data(Vec.begin() == Vec.end() ? static_cast(nullptr) : Vec.begin()), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + constexpr iterator begin() const { + return Data; + } + constexpr iterator end() const { + return Data + Length; + } + + constexpr reverse_iterator rbegin() const { + return reverse_iterator(end()); + } + constexpr reverse_iterator rend() const { + return reverse_iterator(begin()); + } + + /// empty - Check if the array is empty. + constexpr bool empty() const { + return Length == 0; + } + + constexpr const T* data() const { + return Data; + } + + /// size - Get the array size. + constexpr size_t size() const { + return Length; + } + + /// front - Get the first element. 
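// Illustrative usage (not part of this header): unlike operator[], the checked
// accessors below report misuse through AT_CHECK instead of invoking undefined
// behaviour, e.g.
//
//   at::ArrayRef<int> empty;
//   empty.front();   // throws at::Error: "ArrayRef: attempted to access front() of empty list"
//
//   int arr[3] = {1, 2, 3};
//   at::ArrayRef<int> r(arr);
//   r.at(5);         // throws at::Error: "ArrayRef: invalid index Index = 5; Length = 3"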
+ AT_CPP14_CONSTEXPR const T& front() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. + AT_CPP14_CONSTEXPR const T& back() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length - 1]; + } + + /// equals - Check for element-wise equality. + constexpr bool equals(ArrayRef RHS) const { + return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + AT_CPP14_CONSTEXPR ArrayRef slice(size_t N, size_t M) const { + AT_CHECK( + N + M <= size(), + "ArrayRef: invalid slice, N = ", + N, + "; M = ", + M, + "; size = ", + size()); + return ArrayRef(data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + constexpr ArrayRef slice(size_t N) const { + return slice(N, size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + constexpr const T& operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + AT_CPP14_CONSTEXPR const T& at(size_t Index) const { + AT_CHECK( + Index < Length, + "ArrayRef: invalid index Index = ", + Index, + "; Length = ", + Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type& + operator=(U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type& + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data + Length); + } + + /// @} +}; + +} // namespace at diff --git a/aten/src/ATen/Backtrace.cpp b/aten/src/ATen/core/Backtrace.cpp similarity index 92% rename from aten/src/ATen/Backtrace.cpp rename to aten/src/ATen/core/Backtrace.cpp index a8e062051ee633..7914489d50ece3 100644 --- a/aten/src/ATen/Backtrace.cpp +++ b/aten/src/ATen/core/Backtrace.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -7,18 +7,30 @@ #include #include -#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#if defined(__ANDROID__) +#define AT_CORE_MOBILE 1 +#elif ( \ + defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define AT_CORE_MOBILE 1 +#else +#define AT_CORE_MOBILE 0 +#endif + +#if !AT_CORE_MOBILE && !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#define SUPPORTS_BACKTRACE 1 +#else +#define SUPPORTS_BACKTRACE 0 +#endif + +#if SUPPORTS_BACKTRACE #include #include #endif // !defined(_WIN32) namespace at { -#if defined(_MSC_VER) -// Windows does not have cxxabi.h, so we will simply return the original. 
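// Illustrative behaviour (not part of this patch): with the new
// SUPPORTS_BACKTRACE switch, Windows, Emscripten and the newly detected mobile
// targets all share the identity fallback added below, while supported
// platforms keep demangling through cxxabi, e.g.
//
//   at::demangle("_ZN2at4HalfC1Ef");
//   // -> something like "at::Half::Half(float)" where supported,
//   //    the mangled input returned unchanged otherwise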
-std::string demangle(const char* name) { - return std::string(name); -} -#elif !defined(__EMSCRIPTEN__) + +#if SUPPORTS_BACKTRACE std::string demangle(const char* name) { int status = -1; @@ -45,6 +57,10 @@ std::string demangle(const char* name) { return name; } } +#else +std::string demangle(const char* name) { + return std::string(name); +} #endif // TODO: This backtrace retrieval can be implemented on Windows via the Windows @@ -52,8 +68,7 @@ std::string demangle(const char* name) { // https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code // https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows // https://msdn.microsoft.com/en-us/library/windows/desktop/bb204633%28v=vs.85%29.aspx. -#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) - +#if SUPPORTS_BACKTRACE namespace { struct FrameInformation { @@ -143,14 +158,13 @@ at::optional parse_frame_information( } } // anonymous namespace - -#endif // !defined(_WIN32) +#endif // SUPPORTS_BACKTRACE std::string get_backtrace( size_t frames_to_skip, size_t maximum_number_of_frames, bool skip_python_frames) { -#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#if SUPPORTS_BACKTRACE // We always skip this frame (backtrace). frames_to_skip += 1; @@ -221,10 +235,9 @@ std::string get_backtrace( } return stream.str(); - -#else - +#else // !SUPPORTS_BACKTRACE return "(no backtrace available)"; -#endif +#endif // SUPPORTS_BACKTRACE } + } // namespace at diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h new file mode 100644 index 00000000000000..ec4c17c6f6a531 --- /dev/null +++ b/aten/src/ATen/core/Backtrace.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +#include + +namespace at { +/// Utility to demangle a C++ symbol name. +AT_CORE_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +AT_CORE_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace at diff --git a/aten/src/ATen/core/C++17.cpp b/aten/src/ATen/core/C++17.cpp new file mode 100644 index 00000000000000..6074cb6be15e9c --- /dev/null +++ b/aten/src/ATen/core/C++17.cpp @@ -0,0 +1 @@ +#include diff --git a/caffe2/utils/C++17.h b/aten/src/ATen/core/C++17.h similarity index 93% rename from caffe2/utils/C++17.h rename to aten/src/ATen/core/C++17.h index 0186944e251159..5112d9070dcd5e 100644 --- a/caffe2/utils/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -95,10 +95,14 @@ template using decay_t = typename std::decay::type; #ifdef __cpp_lib_logical_traits -using conjunction = std::conjunction; -using disjunction = std::disjunction; -using bool_constant = std::bool_constant; -using negation = std::negation; +template +using conjunction = std::conjunction; +template +using disjunction = std::disjunction; +template +using bool_constant = std::bool_constant; +template +using negation = std::negation; #else @@ -145,7 +149,10 @@ template using void_t = typename make_void::type; #ifdef __cpp_lib_apply -using apply = std::apply; +template +inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { + return std::apply(std::forward(f), std::forward(t)); +} #else @@ -175,9 +182,9 @@ constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( #if defined(__cpp_constexpr) && __cpp_constexpr >= 201304 -# define C10_CPP14_CONSTEXPR constexpr +# define AT_CPP14_CONSTEXPR constexpr #else -# define C10_CPP14_CONSTEXPR +# define AT_CPP14_CONSTEXPR #endif diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt new file mode 100644 index 00000000000000..59149be784c3a6 --- /dev/null +++ b/aten/src/ATen/core/CMakeLists.txt @@ -0,0 +1,16 @@ +# This file solely exists to let Caffe2 Android build get at the list +# of core files without having to trundle through all of ATen's CMakeLists.txt + +FILE(GLOB ATen_CORE_HEADERS "*.h") +FILE(GLOB ATen_CORE_SRCS "*.cpp") +FILE(GLOB ATen_CORE_TEST_SRCS "*_test.cpp") +EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) + +# Pass to parent +set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE) +set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) +# This is a little dodgy, because it means ALL ATen headers are made +# visible. Fortunately, you should just get a lot of undefined symbol +# errors if you go outside core +set(ATen_CORE_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../.. PARENT_SCOPE) diff --git a/aten/src/ATen/core/CoreAPI.h b/aten/src/ATen/core/CoreAPI.h new file mode 100644 index 00000000000000..0ee114d9f4cfdd --- /dev/null +++ b/aten/src/ATen/core/CoreAPI.h @@ -0,0 +1,20 @@ +// You can use the definition AT_CORE_STATIC_WINDOWS to control whether +// or not we apply __declspec. You will want to set this as +// -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links +// against ATen/core on Windows, when ATen/core is built as a +// static library (in which case, saying the symbol is coming +// from a DLL would be incorrect). 
+ +#ifdef _WIN32 +#if !defined(AT_CORE_STATIC_WINDOWS) +#if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#define AT_CORE_API __declspec(dllexport) +#else +#define AT_CORE_API __declspec(dllimport) +#endif +#else +#define AT_CORE_API +#endif +#else +#define AT_CORE_API +#endif diff --git a/aten/src/ATen/Error.cpp b/aten/src/ATen/core/Error.cpp similarity index 64% rename from aten/src/ATen/Error.cpp rename to aten/src/ATen/core/Error.cpp index 1261fbe0295d6c..35ba7d644e109b 100644 --- a/aten/src/ATen/Error.cpp +++ b/aten/src/ATen/core/Error.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -11,9 +11,13 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { } Error::Error(SourceLocation source_location, std::string err) - : what_without_backtrace_(err) - , what_(str(err, " (", source_location, ")\n", get_backtrace(/*frames_to_skip=*/2))) - {} + : what_without_backtrace_(err), + what_( + str(err, + " (", + source_location, + ")\n", + get_backtrace(/*frames_to_skip=*/2))) {} void Warning::warn(SourceLocation source_location, std::string msg) { warning_handler_(source_location, msg.c_str()); @@ -23,7 +27,9 @@ void Warning::set_warning_handler(handler_t handler) { warning_handler_ = handler; } -void Warning::print_warning(const SourceLocation& source_location, const char* msg) { +void Warning::print_warning( + const SourceLocation& source_location, + const char* msg) { std::cerr << "Warning: " << msg << " (" << source_location << ")\n"; } diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h new file mode 100644 index 00000000000000..b95a5f120f21b8 --- /dev/null +++ b/aten/src/ATen/core/Error.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace at { + +namespace detail { + +inline std::ostream& _str(std::ostream& ss) { + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t) { + ss << t; + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +} // namespace detail + +// Convert a list of string-like arguments into a single string. +template +inline std::string str(const Args&... args) { + std::ostringstream ss; + detail::_str(ss, args...); + return ss.str(); +} + +// Specializations for already-a-string types. +template <> +inline std::string str(const std::string& str) { + return str; +} +inline std::string str(const char* c_str) { + return c_str; +} + +/// Represents a location in source code (for debugging). +struct SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. Should +/// primarily be used with the `AT_ERROR` macro. +/// +/// NB: at::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class AT_CORE_API Error : public std::exception { + std::string what_without_backtrace_; + std::string what_; + + public: + Error(SourceLocation source_location, std::string err); + + /// Returns the complete error message, including the source location. 
+ const char* what() const noexcept override { + return what_.c_str(); + } + + /// Returns only the error message string, without source location. + const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } +}; + +class AT_CORE_API Warning { + using handler_t = + void (*)(const SourceLocation& source_location, const char* msg); + + public: + /// Issue a warning with a given message. Dispatched to the current + /// warning handler. + static void warn(SourceLocation source_location, std::string msg); + + /// Sets the global warning handler. This is not thread-safe, so it should + /// generally be called once during initialization. + static void set_warning_handler(handler_t handler); + + /// The default warning handler. Prints the message to stderr. + static void print_warning( + const SourceLocation& source_location, + const char* msg); + + private: + static handler_t warning_handler_; +}; + +} // namespace at + +// TODO: variants that print the expression tested and thus don't require +// strings +// TODO: CAFFE_ENFORCE_WITH_CALLER style macro + +#define AT_ERROR(...) \ + throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_WARN(...) \ + at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_ASSERT(cond) \ + if (!(cond)) { \ + AT_ERROR( \ + #cond " ASSERT FAILED at ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ", please report a bug to PyTorch."); \ + } + +#define AT_ASSERTM(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str( \ + #cond, \ + " ASSERT FAILED at ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ", please report a bug to PyTorch. ", \ + __VA_ARGS__)); \ + } + +#define AT_CHECK(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str(__VA_ARGS__)); \ + } diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h new file mode 100644 index 00000000000000..d89b496d7083b8 --- /dev/null +++ b/aten/src/ATen/core/Half-inl.h @@ -0,0 +1,249 @@ +#pragma once + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#if defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +namespace at { + +/// Constructors + +inline AT_HOSTDEVICE Half::Half(float value) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x = __half_as_short(__float2half(value)); +#else + x = detail::float2halfbits(value); +#endif +} + +/// Implicit conversions + +inline AT_HOSTDEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#else + return detail::halfbits2float(x); +#endif +} + +#ifdef __CUDACC__ +inline AT_HOSTDEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline AT_HOSTDEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +/// Arithmetic + +inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { + return (float)a + (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { + return (float)a - (float)b; +} + +inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { + return (float)a * (float)b; +} + +inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { + return (float)a / (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a) { + return -(float)a; +} + +inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline 
AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline AT_HOSTDEVICE float operator+(Half a, float b) { + return (float)a + b; +} +inline AT_HOSTDEVICE float operator-(Half a, float b) { + return (float)a - b; +} +inline AT_HOSTDEVICE float operator*(Half a, float b) { + return (float)a * b; +} +inline AT_HOSTDEVICE float operator/(Half a, float b) { + return (float)a / b; +} + +inline AT_HOSTDEVICE float operator+(float a, Half b) { + return a + (float)b; +} +inline AT_HOSTDEVICE float operator-(float a, Half b) { + return a - (float)b; +} +inline AT_HOSTDEVICE float operator*(float a, Half b) { + return a * (float)b; +} +inline AT_HOSTDEVICE float operator/(float a, Half b) { + return a / (float)b; +} + +inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { + return a += (float)b; +} +inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { + return a -= (float)b; +} +inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { + return a *= (float)b; +} +inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { + return a /= (float)b; +} + +/// Arithmetic with doubles + +inline AT_HOSTDEVICE double operator+(Half a, double b) { + return (double)a + b; +} +inline AT_HOSTDEVICE double operator-(Half a, double b) { + return (double)a - b; +} +inline AT_HOSTDEVICE double operator*(Half a, double b) { + return (double)a * b; +} +inline AT_HOSTDEVICE double operator/(Half a, double b) { + return (double)a / b; +} + +inline AT_HOSTDEVICE double operator+(double a, Half b) { + return a + (double)b; +} +inline AT_HOSTDEVICE double operator-(double a, Half b) { + return a - (double)b; +} +inline AT_HOSTDEVICE double operator*(double a, Half b) { + return a * (double)b; +} +inline AT_HOSTDEVICE double operator/(double a, Half b) { + return a / (double)b; +} + +/// Arithmetic with ints + +inline AT_HOSTDEVICE Half operator+(Half a, int b) { + return a + (Half)b; +} +inline AT_HOSTDEVICE Half operator-(Half a, int b) { + return a - (Half)b; +} +inline AT_HOSTDEVICE Half operator*(Half a, int b) { + return a * (Half)b; +} +inline AT_HOSTDEVICE Half operator/(Half a, int b) { + return a / (Half)b; +} + +inline AT_HOSTDEVICE Half operator+(int a, Half b) { + return (Half)a + b; +} +inline AT_HOSTDEVICE Half operator-(int a, Half b) { + return (Half)a - b; +} +inline AT_HOSTDEVICE Half operator*(int a, Half b) { + return (Half)a * b; +} +inline AT_HOSTDEVICE Half operator/(int a, Half b) { + return (Half)a / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from at::Half to float. 
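// Illustrative usage (not part of this header): every operator above computes
// in float, and comparisons go through the implicit Half -> float conversion,
// e.g.
//
//   at::Half a = 1.5f, b = 0.25f;
//   at::Half c = a + b;   // evaluated as (float)a + (float)b, then narrowed back
//   bool lt = b < a;      // both operands convert to float first
//   float f = c;          // widening a Half to float is always exact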
+ +} // namespace at + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + static constexpr at::Half min() { + return at::Half(0x0400, at::Half::from_bits); + } + static constexpr at::Half lowest() { + return at::Half(0xFBFF, at::Half::from_bits); + } + static constexpr at::Half max() { + return at::Half(0x7BFF, at::Half::from_bits); + } + static constexpr at::Half epsilon() { + return at::Half(0x1400, at::Half::from_bits); + } + static constexpr at::Half round_error() { + return at::Half(0x3800, at::Half::from_bits); + } + static constexpr at::Half infinity() { + return at::Half(0x7C00, at::Half::from_bits); + } + static constexpr at::Half quiet_NaN() { + return at::Half(0x7E00, at::Half::from_bits); + } + static constexpr at::Half signaling_NaN() { + return at::Half(0x7D00, at::Half::from_bits); + } + static constexpr at::Half denorm_min() { + return at::Half(0x0001, at::Half::from_bits); + } +}; + +} // namespace std diff --git a/aten/src/ATen/core/Half.cpp b/aten/src/ATen/core/Half.cpp new file mode 100644 index 00000000000000..e511f03a92bc73 --- /dev/null +++ b/aten/src/ATen/core/Half.cpp @@ -0,0 +1,105 @@ +#include + +#include + +namespace at { + +static_assert( + std::is_standard_layout::value, + "at::Half must be standard layout."); + +namespace detail { + +// Host functions for converting between FP32 and FP16 formats + +float halfbits2float(unsigned short h) { + unsigned sign = ((h >> 15) & 1); + unsigned exponent = ((h >> 10) & 0x1f); + unsigned mantissa = ((h & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + unsigned result_bit = (sign << 31) | (exponent << 23) | mantissa; + + // Reinterpret the result bit pattern as a float + float result_float; + std::memcpy(&result_float, &result_bit, sizeof(result_float)); + return result_float; +} + +unsigned short float2halfbits(float src) { + // Reinterpret the float as a bit pattern + unsigned x; + std::memcpy(&x, &src, sizeof(x)); + + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + return 0x7fffU; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + return sign | 0x7c00U; + } + if (u < 0x33000001) { + return (sign | 0x0000); + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + return (sign | (exponent << 10) | mantissa); +} + +} // namespace detail + +std::ostream& operator<<(std::ostream& out, const Half& value) { + out << (float)value; + return out; +} + +} // namespace at diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h new file mode 100644 index 00000000000000..385f18e78cab02 --- /dev/null +++ b/aten/src/ATen/core/Half.h @@ -0,0 +1,127 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinisics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. 
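// Illustrative round trip (not part of this header): on the host, Half(float)
// and operator float() dispatch to the float2halfbits / halfbits2float helpers
// implemented in Half.cpp above, e.g.
//
//   unsigned short bits = at::detail::float2halfbits(3.14159f);  // 0x4248
//   float back = at::detail::halfbits2float(bits);               // 3.140625f
//   at::Half pi(0x4248, at::Half::from_bits);  // same bit pattern, no conversion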
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#if defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +#ifndef AT_HOSTDEVICE +#ifdef __CUDACC__ +#define AT_HOSTDEVICE __host__ __device__ +#else +#define AT_HOSTDEVICE +#endif +#endif + +namespace at { + +namespace detail { + +AT_CORE_API float halfbits2float(unsigned short bits); +AT_CORE_API unsigned short float2halfbits(float value); + +} // namespace detail + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits = from_bits_t(); + + // HIP wants __host__ __device__ tag, CUDA does not +#ifdef __HIP_PLATFORM_HCC__ + AT_HOSTDEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; + inline AT_HOSTDEVICE Half(float value); + inline AT_HOSTDEVICE operator float() const; + +#ifdef __CUDACC__ + inline AT_HOSTDEVICE Half(const __half& value); + inline AT_HOSTDEVICE operator __half() const; +#endif +}; + +template +To convert(From f) { + return static_cast(f); +} + +// skip isnan and isinf check for integral types +template +typename std::enable_if::value, bool>::type overflows( + From f) { + using limit = std::numeric_limits; + if (!limit::is_signed && std::numeric_limits::is_signed) { + // allow for negative numbers to wrap using two's complement arithmetic. + // For example, with uint8, this allows for `a - b` to be treated as + // `a + 255 * b`. + return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); + } else { + return f < limit::lowest() || f > limit::max(); + } +} + +template +typename std::enable_if::value, bool>::type overflows( + From f) { + using limit = std::numeric_limits; + if (limit::has_infinity && std::isinf((double)f)) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +template +To checked_convert(From f, const char* name) { + if (overflows(f)) { + std::ostringstream oss; + oss << "value cannot be converted to type " << name << " without overflow: " << f; + throw std::domain_error(oss.str()); + } + return convert(f); +} + +template +To HalfFix(From h) { + To ret; + ret.x = h.x; + return ret; +} + +AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); + +} // namespace at + +#include "ATen/core/Half-inl.h" + +#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h new file mode 100644 index 00000000000000..7d152269d9a8c2 --- /dev/null +++ b/aten/src/ATen/core/IdWrapper.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +namespace at { + +/** + * This template simplifies generation of simple classes that wrap an id + * in a typesafe way. Namely, you can use it to create a very lightweight + * type that only offers equality comparators and hashing. Example: + * + * struct MyIdType final : IdWrapper { + * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} + * }; + * + * Then in the global top level namespace: + * + * AT_DEFINE_HASH_FOR_IDWRAPPER(MyIdType); + * + * That's it - equality operators and hash functions are automatically defined + * for you, given the underlying type supports it. 
+ */ +template +class IdWrapper { + public: + using underlying_type = UnderlyingType; + using concrete_type = ConcreteType; + + protected: + constexpr explicit IdWrapper(underlying_type id) noexcept( + noexcept(underlying_type(std::declval()))) + : id_(id) {} + + constexpr underlying_type underlyingId() const + noexcept(noexcept(underlying_type(std::declval()))) { + return id_; + } + + private: + friend size_t hash_value(const concrete_type& v) { + return std::hash()(v.id_); + } + + // TODO Making operator== noexcept if underlying type is noexcept equality + // comparable doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator==( + const concrete_type& lhs, + const concrete_type& rhs) { + return lhs.id_ == rhs.id_; + } + + // TODO Making operator!= noexcept if operator== is noexcept doesn't work with + // GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator!=( + const concrete_type& lhs, + const concrete_type& rhs) { + return !(lhs == rhs); + } + + underlying_type id_; +}; + +} // namespace at + +#define AT_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ + namespace std { \ + template <> \ + struct hash { \ + size_t operator()(ClassName x) const { \ + return hash_value(x); \ + } \ + }; \ + } diff --git a/aten/src/ATen/core/README.md b/aten/src/ATen/core/README.md new file mode 100644 index 00000000000000..71654f44e26f91 --- /dev/null +++ b/aten/src/ATen/core/README.md @@ -0,0 +1,5 @@ +ATen Core +--------- + +ATen Core is a minimal subset of ATen which is suitable for deployment +on mobile. Binary size of files in this folder is an important constraint. diff --git a/aten/src/ATen/SmallVector.cpp b/aten/src/ATen/core/SmallVector.cpp similarity index 87% rename from aten/src/ATen/SmallVector.cpp rename to aten/src/ATen/core/SmallVector.cpp index 59095a2809c7a8..976809c5b50931 100644 --- a/aten/src/ATen/SmallVector.cpp +++ b/aten/src/ATen/core/SmallVector.cpp @@ -14,20 +14,22 @@ // ATen: modified from llvm::SmallVector. // replaced report_bad_alloc_error with std::bad_alloc -#include "SmallVector.h" +#include namespace at { /// grow_pod - This is an implementation of the grow() method which only works /// on POD-like datatypes and is out of line to reduce code duplication. -void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, - size_t TSize) { +void SmallVectorBase::grow_pod( + void* FirstEl, + size_t MinSizeInBytes, + size_t TSize) { size_t CurSizeBytes = size_in_bytes(); size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. if (NewCapacityInBytes < MinSizeInBytes) NewCapacityInBytes = MinSizeInBytes; - void *NewElts; + void* NewElts; if (BeginX == FirstEl) { NewElts = malloc(NewCapacityInBytes); if (NewElts == nullptr) @@ -42,9 +44,9 @@ void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, throw std::bad_alloc(); } - this->EndX = (char*)NewElts+CurSizeBytes; + this->EndX = (char*)NewElts + CurSizeBytes; this->BeginX = NewElts; this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; } -} +} // namespace at diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h new file mode 100644 index 00000000000000..269b21b0d5cf37 --- /dev/null +++ b/aten/src/ATen/core/SmallVector.h @@ -0,0 +1,1034 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc +// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE +// replaced iterator_range constructor with inline Container&& constructor +// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers +// removed LLVM_UNLIKELY + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __GNUG__ && __GNUC__ < 5 +#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) +#else +#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value +#endif + +namespace at { + +namespace detail { + +// From llvm/Support/MathExtras.h +static inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +} // namespace detail + +/// This is all the non-templated stuff common to all SmallVectors. +class AT_CORE_API SmallVectorBase { + protected: + void *BeginX, *EndX, *CapacityX; + + protected: + SmallVectorBase(void* FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl + Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void* FirstEl, size_t MinSizeInBytes, size_t TSize); + + public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { + return BeginX == EndX; + } +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon : public SmallVectorBase { + private: + template + friend struct SmallVectorStorage; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + using U = AlignedCharArrayUnion; + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. 
+ void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T* P) { + this->EndX = P; + } + + public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + + // forward iterator creation methods. + iterator begin() { + return (iterator)this->BeginX; + } + const_iterator begin() const { + return (const_iterator)this->BeginX; + } + iterator end() { + return (iterator)this->EndX; + } + const_iterator end() const { + return (const_iterator)this->EndX; + } + + protected: + iterator capacity_ptr() { + return (iterator)this->CapacityX; + } + const_iterator capacity_ptr() const { + return (const_iterator)this->CapacityX; + } + + public: + // reverse iterator creation methods. + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + size_type size() const { + return end() - begin(); + } + size_type max_size() const { + return size_type(-1) / sizeof(T); + } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { + return capacity_ptr() - begin(); + } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { + return pointer(begin()); + } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { + return const_pointer(begin()); + } + + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with non-POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T* S, T* E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy( + std::make_move_iterator(I), std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. 
+ void grow(size_t MinSize = 0); + + public: + void push_back(const T& Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*)this->end()) T(Elt); + this->setEnd(this->end() + 1); + } + + void push_back(T&& Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*)this->end()) T(::std::move(Elt)); + this->setEnd(this->end() + 1); + } + + void pop_back() { + this->setEnd(this->end() - 1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity + 2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T* NewElts = static_cast(malloc(NewCapacity * sizeof(T))); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->setEnd(NewElts + CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin() + NewCapacity; +} + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T*, T*) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1* I, + T1* E, + T2* Dest, + typename std::enable_if< + std::is_same::type, T2>::value>:: + type* = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize * sizeof(T), sizeof(T)); + } + + public: + void push_back(const T& Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end() + 1); + } + + void pop_back() { + this->setEnd(this->end() - 1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. 
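// Illustrative usage (not part of this header): SmallVectorImpl<T> is the
// N-independent interface, so helpers can take it by reference and accept a
// SmallVector of any inline capacity, e.g.
//
//   void fill(at::SmallVectorImpl<int>& out) { out.append({1, 2, 3}); }
//
//   at::SmallVector<int, 8> buf;   // first 8 ints live inside buf, no heap yet
//   fill(buf);                     // only grows onto the heap past 8 elements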
+template +class SmallVectorImpl + : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + + public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + + protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase(N * sizeof(T)) { + } + + public: + SmallVectorImpl(const SmallVectorImpl&) = delete; + + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin() + N, this->end()); + this->setEnd(this->begin() + N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + auto I = this->end(); + for (auto E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin() + N); + } + } + + void resize(size_type N, const T& NV) { + if (N < this->size()) { + this->destroy_range(this->begin() + N, this->end()); + this->setEnd(this->begin() + N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin() + N, NV); + this->setEnd(this->begin() + N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl& RHS); + + /// Add the specified range to the end of the SmallVector. + template < + typename in_iter, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr() - this->end())) + this->grow(this->size() + NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T& Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr() - this->end())) + this->grow(this->size() + NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. 
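+  //
+  // Illustrative example only -- the assign(...) overloads below replace the
+  // current contents wholesale:
+  //
+  //   at::SmallVector<int, 4> v;
+  //   v.assign(3, 7);          // v == {7, 7, 7}
+  //   v.assign({1, 2, 3, 4});  // v == {1, 2, 3, 4}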
+ + void assign(size_type NumElts, const T& Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin() + NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + template < + typename in_iter, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + assert(I >= this->begin() && "Iterator to erase is out of bounds."); + assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I + 1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return (N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(S >= this->begin() && "Range to erase is out of bounds."); + assert(S <= E && "Trying to erase invalid range."); + assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return (N); + } + + iterator insert(iterator I, T&& Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end() - 1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I - this->begin(); + this->grow(); + I = this->begin() + EltNo; + } + + ::new ((void*)this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end() - 1, this->end()); + this->setEnd(this->end() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T* EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T& Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end() - 1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I - this->begin(); + this->grow(); + I = this->begin() + EltNo; + } + ::new ((void*)this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end() - 1, this->end()); + this->setEnd(this->end() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T* EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T& Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. 
+ append(NumToInsert, Elt); + return this->begin() + InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, Elt); + return I; + } + + template < + typename ItTy, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin() + InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // Replace the overwritten part. + for (T* J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; + ++From; + } + + // Insert the non-overwritten middle part. 
+ this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template + void emplace_back(ArgTypes&&... Args) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl& operator=(const SmallVectorImpl& RHS); + + SmallVectorImpl& operator=(SmallVectorImpl&& RHS); + + bool operator==(const SmallVectorImpl& RHS) const { + if (this->size() != RHS.size()) + return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl& RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl& RHS) const { + return std::lexicographical_compare( + this->begin(), this->end(), RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl& RHS) { + if (this == &RHS) + return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) + NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end() + EltDiff); + this->destroy_range(this->begin() + NumShared, this->end()); + this->setEnd(this->begin() + NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin() + NumShared, RHS.end()); + RHS.setEnd(RHS.begin() + NumShared); + } +} + +template +SmallVectorImpl& SmallVectorImpl::operator=( + const SmallVectorImpl& RHS) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin() + RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. 
+ // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. + this->setEnd(this->begin() + RHSSize); + return *this; +} + +template +SmallVectorImpl& SmallVectorImpl::operator=(SmallVectorImpl&& RHS) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) + free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. + this->setEnd(this->begin() + RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements which aren't contained in +/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' +/// element is in the base class. This is specialized for the N=1 and N=0 cases +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; +template +struct SmallVectorStorage {}; +template +struct SmallVectorStorage {}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// Note that this does not attempt to be exception safe. +/// +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. 
+ SmallVectorStorage Storage; + + public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t Size, const T& Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template < + typename ItTy, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + template + explicit SmallVector(Container&& c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + const SmallVector& operator=(const SmallVector& RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector&& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + template + const SmallVector& operator=(const Container& RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl&& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + const SmallVector& operator=(SmallVector&& RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector& operator=(SmallVectorImpl&& RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + template + const SmallVector& operator=(Container&& C) { + this->assign(C.begin(), C.end()); + return *this; + } + + const SmallVector& operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector& X) { + return X.capacity_in_bytes(); +} + +} // end namespace at + +namespace std { + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap(at::SmallVectorImpl& LHS, at::SmallVectorImpl& RHS) { + LHS.swap(RHS); +} + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap(at::SmallVector& LHS, at::SmallVector& RHS) { + LHS.swap(RHS); +} + +} // end namespace std diff --git a/aten/src/ATen/core/UniqueVoidPtr.cpp b/aten/src/ATen/core/UniqueVoidPtr.cpp new file mode 100644 index 00000000000000..fd08f7e13d2bf8 --- /dev/null +++ b/aten/src/ATen/core/UniqueVoidPtr.cpp @@ -0,0 +1,9 @@ +#include + +namespace at { +namespace detail { + +void deleteNothing(void*) {} + +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h similarity index 77% rename from aten/src/ATen/detail/UniqueVoidPtr.h rename to aten/src/ATen/core/UniqueVoidPtr.h index e277014a7935d6..299c729e125a58 100644 --- a/aten/src/ATen/detail/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -1,15 +1,15 @@ #include -#include +#include namespace at { -using DeleterFnPtr = void(*)(void*); +using DeleterFnPtr = void (*)(void*); namespace detail { // Does not delete anything -AT_API void deleteNothing(void*); +AT_CORE_API void deleteNothing(void*); // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but // with three major differences: @@ -35,33 +35,47 @@ AT_API void deleteNothing(void*); // to reflect this. 
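+// For example (illustrative sketch; deleteIntBuffer is a hypothetical helper,
+// not part of this header):
+//
+//   void deleteIntBuffer(void* ctx) { delete[] static_cast<int*>(ctx); }
+//
+//   int* raw = new int[16];
+//   at::detail::UniqueVoidPtr ptr(raw, /*ctx=*/raw, &deleteIntBuffer);
+//   // ptr.get() == raw; deleteIntBuffer(raw) runs when ptr is destroyed.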
// class UniqueVoidPtr { -private: + private: // Lifetime tied to ctx_ void* data_; std::unique_ptr ctx_; -public: + + public: UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {} - explicit UniqueVoidPtr(void* data) : data_(data), ctx_(nullptr, &deleteNothing) {} + explicit UniqueVoidPtr(void* data) + : data_(data), ctx_(nullptr, &deleteNothing) {} UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) - : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} - void* operator->() const { return data_; } + : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} + void* operator->() const { + return data_; + } void clear() { ctx_ = nullptr; data_ = nullptr; } - void* get() const { return data_; } - void* get_context() const { return ctx_.get(); } - void* release_context() { return ctx_.release(); } + void* get() const { + return data_; + } + void* get_context() const { + return ctx_.get(); + } + void* release_context() { + return ctx_.release(); + } template T* cast_context(DeleterFnPtr expected_deleter) const { - if (get_deleter() != expected_deleter) return nullptr; + if (get_deleter() != expected_deleter) + return nullptr; return static_cast(get_context()); } - operator bool() const { return data_ || ctx_; } - DeleterFnPtr get_deleter() const { return ctx_.get_deleter(); } + operator bool() const { + return data_ || ctx_; + } + DeleterFnPtr get_deleter() const { + return ctx_.get_deleter(); + } }; - // Note [How UniqueVoidPtr is implemented] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // UniqueVoidPtr solves a common problem for allocators of tensor data, which @@ -80,9 +94,18 @@ class UniqueVoidPtr { // pointer itself. In simple cases, the context pointer is just the pointer // itself. -inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return !sp; } -inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return !sp; } -inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return sp; } -inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return sp; } +inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { + return !sp; +} +inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { + return !sp; +} +inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { + return sp; +} +inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { + return sp; +} -}} // namespace at::detail +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/core/optional.h b/aten/src/ATen/core/optional.h new file mode 100644 index 00000000000000..8b0a7bfc4ead31 --- /dev/null +++ b/aten/src/ATen/core/optional.h @@ -0,0 +1,1027 @@ +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// From https://github.com/akrzemi1/Optional +// +// ATen: +// - Move to `at` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle +// it. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define TR2_OPTIONAL_REQUIRES(...) 
\ + typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +#if defined __GNUC__ // NOTE: GNUC is also defined for Clang +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#endif +# +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#endif +# +#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#endif +#endif +# +#if defined __clang_major__ +#if (__clang_major__ == 3 && __clang_minor__ >= 5) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#elif (__clang_major__ > 3) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#endif +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#elif ( \ + __clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#endif +#endif +# +#if defined _MSC_VER +#if (_MSC_VER >= 1900) +#define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#endif +#endif + +#if defined __clang__ +#if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif +#elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif + +#if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +#define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +#else +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +#define OPTIONAL_CONSTEXPR_INIT_LIST +#endif + +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && \ + (__cplusplus != 201103L) +#define OPTIONAL_HAS_MOVE_ACCESSORS 1 +#else +#define OPTIONAL_HAS_MOVE_ACCESSORS 0 +#endif + +#// In C++11 constexpr implies const, so we need to make non-const members also non-constexpr +#if (defined __cplusplus) && (__cplusplus == 201103L) +#define OPTIONAL_MUTABLE_CONSTEXPR +#else +#define OPTIONAL_MUTABLE_CONSTEXPR constexpr +#endif + +namespace at { + +// 20.5.4, optional for object types +template +class optional; + +// 20.5.5, optional for lvalue reference types +template +class optional; + +// workaround: std utility functions aren't constexpr yet +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type& t) noexcept { + return static_cast(t); +} + +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template +inline constexpr typename std::remove_reference::type&& constexpr_move( + T&& t) noexcept { + return static_cast::type&&>(t); +} + +#if defined NDEBUG +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) \ + ((CHECK) ? (EXPR) : ([] { assert(!#CHECK); }(), (EXPR))) +#endif + +namespace detail_ { + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof { + template + constexpr static bool has_overload(...) 
{ + return false; + } + + template ().operator&())> + constexpr static bool has_overload(bool) { + return true; + } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) { + return &ref; +} + +template )> +T* static_addressof(T& ref) { + return std::addressof(ref); +} + +// the call to convert(b) has return type A and converts b to type A iff b +// decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { + return v; +} + +} // namespace detail_ + +constexpr struct trivial_init_t { +} trivial_init{}; + +// 20.5.6, In-place construction +constexpr struct in_place_t { +} in_place{}; + +// 20.5.7, Disengaged state indicator +struct nullopt_t { + struct init {}; + constexpr explicit nullopt_t(init) {} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { + public: + explicit bad_optional_access(const std::string& what_arg) + : logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} +}; + +template +union storage_t { + unsigned char dummy_; + T value_; + + constexpr storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~storage_t() {} +}; + +template +union constexpr_storage_t { + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr constexpr_storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + +template +struct optional_base { + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init){}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + explicit optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { + if (init_) + storage_.value_.T::~T(); + } +}; + +template +struct constexpr_optional_base { + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept + : init_(false), storage_(trivial_init){}; + + explicit constexpr constexpr_optional_base(const T& v) + : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type>>::type; + +template +class optional : private OptionalBase { + static_assert( + !std::is_same::type, nullopt_t>::value, + "bad T"); + static_assert( + !std::is_same::type, in_place_t>::value, + "bad T"); + + constexpr bool initialized() const noexcept { + return OptionalBase::init_; + } + typename std::remove_const::type* dataptr() { + return std::addressof(OptionalBase::storage_.value_); + } + constexpr const T* dataptr() const { + return detail_::static_addressof(OptionalBase::storage_.value_); + } + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { + return OptionalBase::storage_.value_; + } +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { + return OptionalBase::storage_.value_; + } +#else + T& contained_val() & { + return OptionalBase::storage_.value_; + } + T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } +#endif +#else + constexpr const T& contained_val() const { + return OptionalBase::storage_.value_; + } + T& contained_val() { + return OptionalBase::storage_.value_; + } +#endif + + void clear() noexcept { + if (initialized()) + dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept( + noexcept(T(std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept( + noexcept(T(il, std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + + public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase(){}; + constexpr optional(nullopt_t) noexcept : OptionalBase(){}; + + optional(const optional& rhs) : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept( + std::is_nothrow_move_constructible::value) + : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional( + in_place_t, + std::initializer_list il, + Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) -> typename std::enable_if< + std::is_same::type, T>::value, + optional&>::type { + if (initialized()) { + contained_val() = std::forward(v); + } else { + initialize(std::forward(v)); + } + return *this; + } + + template + void emplace(Args&&... args) { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept( + std::is_nothrow_move_constructible::value&& noexcept( + swap(std::declval(), std::declval()))) { + if (initialized() == true && rhs.initialized() == false) { + rhs.initialize(std::move(**this)); + clear(); + } else if (initialized() == false && rhs.initialized() == true) { + initialize(std::move(*rhs)); + rhs.clear(); + } else if (initialized() == true && rhs.initialized() == true) { + using std::swap; + swap(**this, *rhs); + } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { + return initialized(); + } + constexpr bool has_value() const noexcept { + return initialized(); + } + + constexpr T const* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator*() & { + assert(initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator*() && { + assert(initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) + throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +#else + + T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const { + return contained_val(); + } + + T& operator*() { + assert(initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() + ? 
contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + +#endif + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#else + + template + T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#endif + +#else + + template + constexpr T value_or(V&& v) const { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#endif + + // 20.6.3.6, modifiers + void reset() noexcept { + clear(); + } +}; + +template +class optional { + static_assert(!std::is_same::value, "bad T"); + static_assert(!std::is_same::value, "bad T"); + T* ref; + + public: + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept + : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + std::is_same::type, optional>::value, + optional&>::type { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + !std::is_same::type, optional>::value, + optional&>::type = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + void swap(optional& rhs) noexcept { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref + : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const { + return *this ? **this + : detail_::convert::type>( + constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { + ref = nullptr; + } +}; + +template +class optional { + static_assert(sizeof(T) == 0, "optional rvalue references disallowed"); +}; + +// 20.5.8, Relational operators +template +constexpr bool operator==(const optional& x, const optional& y) { + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template +constexpr bool operator!=(const optional& x, const optional& y) { + return !(x == y); +} + +template +constexpr bool operator<(const optional& x, const optional& y) { + return (!y) ? false : (!x) ? true : *x < *y; +} + +template +constexpr bool operator>(const optional& x, const optional& y) { + return (y < x); +} + +template +constexpr bool operator<=(const optional& x, const optional& y) { + return !(y < x); +} + +template +constexpr bool operator>=(const optional& x, const optional& y) { + return !(x < y); +} + +// 20.5.9, Comparison with nullopt +template +constexpr bool operator==(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator==(nullopt_t, const optional& x) noexcept { + return (!x); +} + +template +constexpr bool operator!=(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator!=(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<(const optional&, nullopt_t) noexcept { + return false; +} + +template +constexpr bool operator<(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<=(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator<=(nullopt_t, const optional&) noexcept { + return true; +} + +template +constexpr bool operator>(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator>(nullopt_t, const optional&) noexcept { + return false; +} + +template +constexpr bool operator>=(const optional&, nullopt_t) noexcept { + return true; +} + +template +constexpr bool operator>=(nullopt_t, const optional& x) noexcept { + return (!x); +} + +// 20.5.10, Comparison with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? 
*x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) { + x.swap(y); +} + +template +constexpr optional::type> make_optional(T&& v) { + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) { + return optional(v.get()); +} + +} // namespace at + +namespace std { +template +struct hash> { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } +}; + +template +struct hash> { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } +}; +} // namespace std + +#undef TR2_OPTIONAL_REQUIRES +#undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h new file mode 100644 index 00000000000000..eed9f677a2ef18 --- /dev/null +++ b/aten/src/ATen/cuda/detail/KernelUtils.h @@ -0,0 +1,20 @@ +#pragma once +// Contents of this file are copied from THCUNN/common.h for the ease of porting +// THCUNN functions into ATen. + +namespace at { namespace cuda { namespace detail { + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +// Use 1024 threads per block, which requires cuda sm_2x or above +constexpr int CUDA_NUM_THREADS = 1024; + +// CUDA: number of blocks for threads. +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +}}} // namespace at::cuda::detail diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 085f2723bf0455..7ce3da3c9e051c 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -319,6 +319,20 @@ struct AT_CUDA_API RNNDescriptor } }; +#if CUDNN_VERSION >= 7000 + +struct AT_CUDA_API CTCLossDescriptor + : public Descriptor +{ + void set(cudnnDataType_t datatype) { + AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype)); + } +}; + +#endif + union Constant { float f; diff --git a/aten/src/ATen/detail/UniqueVoidPtr.cpp b/aten/src/ATen/detail/UniqueVoidPtr.cpp deleted file mode 100644 index 07531d826367ae..00000000000000 --- a/aten/src/ATen/detail/UniqueVoidPtr.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include - -namespace at { namespace detail { - -void deleteNothing(void*) {} - -}} // namespace at diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/detail/VariableHooksInterface.h index 287116490397f3..836dacb97766ec 100644 --- a/aten/src/ATen/detail/VariableHooksInterface.h +++ b/aten/src/ATen/detail/VariableHooksInterface.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { class Context; @@ -25,6 +26,10 @@ struct AT_API VariableHooksInterface { // squelch -Werror=non-virtual-dtor virtual ~VariableHooksInterface() {} + virtual Type& getVariableType(const at::Type& baseType) const { + AT_ERROR("cannot getVariableType without libtorch"); + } + virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 93c20d4be032f4..b012de25194361 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -290,7 +290,7 @@ def __init__(self, reason): 'Backend::${DenseBackend}, ScalarType::Long)'), 'THStorage*': CodeTemplate( - 'checked_cast_storage<${Storage}>(' + 'checked_cast_storage(' '&${arg_name},"${arg_name}",${arg_pos}, ' 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f2aaffd6eac9d..209cca57c293ff 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -103,10 +103,6 @@ def check_all_files_written(self): TEMPLATE_PATH = options.source_path + "/templates" GENERATOR_DERIVED = CodeTemplate.from_file( TEMPLATE_PATH + "/GeneratorDerived.h") -STORAGE_DERIVED_CPP = CodeTemplate.from_file( - TEMPLATE_PATH + 
"/StorageDerived.cpp") -STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") - TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") @@ -237,7 +233,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['isFloatingType'] = is_floating_type env['isIntegralType'] = not is_floating_type if density == 'Dense': - env['Storage'] = "{}{}Storage".format(backend, scalar_name) env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) @@ -246,7 +241,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['storage_tensor_headers'] = [] if density != 'Sparse': env['storage_tensor_headers'] = [ - '#include "ATen/{}.h"'.format(env['Storage']), '#include "ATen/{}.h"'.format(env['Tensor']), '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), '#include "ATen/{}IntTensor.h"'.format(env['Backend']), @@ -322,8 +316,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if density != 'Sparse': # there are no storage or tensor types for sparse; it's all uniform - fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) - fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) @@ -379,7 +371,7 @@ def declare_outputs(): for backend, density, scalar_types in iterate_types(): scalar_name = scalar_types[0] full_backend = "Sparse" + backend if density == "Sparse" else backend - for kind in ["Storage", "Type", "Tensor"]: + for kind in ["Type", "Tensor"]: if kind != 'Type' and density == "Sparse": # No Storage or Tensor for sparse continue diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index a3dc735ab1e4cb..36f1e4c0bf86de 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -25,6 +25,16 @@ Tensor & selu_(Tensor & self) { return at::elu_(self, SELU_ALPHA, SELU_SCALE); } +Tensor celu(const Tensor & self, Scalar alpha) { + double inv_alpha = 1. / alpha.to(); + return at::elu(self, 1.0, alpha, Scalar(inv_alpha)); +} + +Tensor & celu_(Tensor & self, Scalar alpha) { + double inv_alpha = 1. 
/ alpha.to(); + return at::elu_(self, 1.0, alpha, Scalar(inv_alpha)); +} + Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index a537691f748171..4028e989b87022 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -402,11 +402,11 @@ at::Tensor _convolution_nogroup( bool transposed, IntList output_padding) { ConvParams params; - params.stride = stride; - params.padding = padding; - params.dilation = dilation; + params.stride = stride.vec(); + params.padding = padding.vec(); + params.dilation = dilation.vec(); params.transposed = transposed; - params.output_padding = output_padding; + params.output_padding = output_padding.vec(); params.groups = 1; params.benchmark = false; params.deterministic = false; @@ -474,11 +474,11 @@ std::tuple _convolution_double_backward( auto weight = weight_r; ConvParams params; - params.stride = stride_; - params.padding = padding_; - params.dilation = dilation_; + params.stride = stride_.vec(); + params.padding = padding_.vec(); + params.dilation = dilation_.vec(); params.transposed = transposed_; - params.output_padding = output_padding_; + params.output_padding = output_padding_.vec(); params.groups = groups_; params.benchmark = benchmark; params.deterministic = deterministic; diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 7a6e0788531172..c374740a3ce7d1 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -57,6 +57,7 @@ deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler& st // Boost alpha for higher acceptance probability. 
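+  // (The boost uses the identity: if U ~ Uniform(0, 1) and Y ~ Gamma(alpha + 1),
+  // then Y * U^(1 / alpha) ~ Gamma(alpha).  When alpha == 0 the exponent
+  // 1 / alpha is not finite, so 0 is returned directly: Gamma(alpha) collapses
+  // to a point mass at zero as alpha -> 0.)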
if (alpha < 1.0f) { + if (alpha == 0.f) return 0.f; scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); alpha += 1.0f; } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 7599386ee74172..0026a9907d7eca 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -24,7 +24,7 @@ Tensor embedding(const Tensor & weight, const Tensor & indices, return weight.index_select(0, indices); } - auto size = std::vector(indices.sizes()); + auto size = indices.sizes().vec(); for (auto d : weight.sizes().slice(1)) { size.push_back(d); } diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp new file mode 100644 index 00000000000000..5f1c8255772dcf --- /dev/null +++ b/aten/src/ATen/native/GridSampler.cpp @@ -0,0 +1,780 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" +#include "ATen/native/GridSampler.h" + +#ifdef _OPENMP +#include +#endif + +namespace at { namespace native { + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; + +namespace { + static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { + return std::min(clip_limit - 1, std::max(in, static_cast(0))); + } + + static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { + return h >= 0 && h < H && w >= 0 && w < W; + } + + static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; + } + + template + static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, + int64_t sH, int64_t sW, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_2d(h, w, H, W)) { + data[h * sH + w * sW] += delta; + } + } + + template + static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, + int64_t sD, int64_t sH, int64_t sW, + int64_t D, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_3d(d, h, w, D, H, W)) { + data[d * sD + h * sH + w * sW] += delta; + } + } + + template + Tensor grid_sampler2d_cpu_impl(const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_H = input.size(2); + int64_t inp_W = input.size(3); + int64_t out_H = grid.size(1); + int64_t out_W = grid.size(2); + auto output = at::empty({N, C, out_H, out_W}, input.options()); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sH = input.stride(2); + int64_t inp_sW = input.stride(3); + int64_t grid_sN = grid.stride(0); + int64_t grid_sH = grid.stride(1); + int64_t grid_sW = grid.stride(2); + int64_t grid_sCoor = grid.stride(3); + int64_t out_sN = output.stride(0); + int64_t out_sC = output.stride(1); + int64_t out_sH = output.stride(2); + int64_t out_sW = output.stride(3); + scalar_t *inp_ptr = input.data(); + scalar_t *out_ptr = output.data(); + scalar_t *grid_ptr = grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t n = 0; n < N; ++n) { + scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w) { + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; + scalar_t iy = grid_ptr_N[h * 
grid_sH + w * grid_sW + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + + // get NE, NW, SE, SW pixel values from (x, y) + int64_t ix_nw = static_cast(std::floor(ix)); + int64_t iy_nw = static_cast(std::floor(iy)); + int64_t ix_ne = ix_nw + 1; + int64_t iy_ne = iy_nw; + int64_t ix_sw = ix_nw; + int64_t iy_sw = iy_nw + 1; + int64_t ix_se = ix_nw + 1; + int64_t iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_nw = clip_coordinates(ix_nw, inp_W); + iy_nw = clip_coordinates(iy_nw, inp_H); + ix_ne = clip_coordinates(ix_ne, inp_W); + iy_ne = clip_coordinates(iy_ne, inp_H); + ix_sw = clip_coordinates(ix_sw, inp_W); + iy_sw = clip_coordinates(iy_sw, inp_H); + ix_se = clip_coordinates(ix_se, inp_W); + iy_se = clip_coordinates(iy_se, inp_H); + } + + // calculate bilinear weighted pixel value and set output pixel + scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; + scalar_t *inp_ptr_NC = inp_ptr_N; + for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + *out_ptr_NCHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } + } + } + return output; + } + + template + Tensor grid_sampler3d_cpu_impl(const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_D = input.size(2); + int64_t inp_H = input.size(3); + int64_t inp_W = input.size(4); + int64_t out_D = grid.size(1); + int64_t out_H = grid.size(2); + int64_t out_W = grid.size(3); + auto output = at::empty({N, C, out_D, out_H, out_W}, input.options()); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sD = input.stride(2); + int64_t inp_sH = input.stride(3); + int64_t inp_sW = input.stride(4); + int64_t grid_sN = grid.stride(0); + int64_t grid_sD = grid.stride(1); + int64_t grid_sH = grid.stride(2); + int64_t grid_sW = grid.stride(3); + int64_t grid_sCoor = grid.stride(4); + int64_t out_sN = output.stride(0); + int64_t out_sC = output.stride(1); + int64_t out_sD = output.stride(2); + int64_t out_sH = output.stride(3); + int64_t out_sW = output.stride(4); + scalar_t *inp_ptr = input.data(); + scalar_t *out_ptr = output.data(); + scalar_t *grid_ptr = grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t n = 0; n < N; ++n) { + scalar_t 
*grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + for (int64_t d = 0; d < out_D; ++d) { + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + scalar_t ix = *grid_ptr_NDHW; + scalar_t iy = grid_ptr_NDHW[grid_sCoor]; + scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + iz = ((iz + 1) / 2) * (inp_D - 1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int64_t ix_tnw = static_cast(std::floor(ix)); + int64_t iy_tnw = static_cast(std::floor(iy)); + int64_t iz_tnw = static_cast(std::floor(iz)); + + int64_t ix_tne = ix_tnw + 1; + int64_t iy_tne = iy_tnw; + int64_t iz_tne = iz_tnw; + + int64_t ix_tsw = ix_tnw; + int64_t iy_tsw = iy_tnw + 1; + int64_t iz_tsw = iz_tnw; + + int64_t ix_tse = ix_tnw + 1; + int64_t iy_tse = iy_tnw + 1; + int64_t iz_tse = iz_tnw; + + int64_t ix_bnw = ix_tnw; + int64_t iy_bnw = iy_tnw; + int64_t iz_bnw = iz_tnw + 1; + + int64_t ix_bne = ix_tnw + 1; + int64_t iy_bne = iy_tnw; + int64_t iz_bne = iz_tnw + 1; + + int64_t ix_bsw = ix_tnw; + int64_t iy_bsw = iy_tnw + 1; + int64_t iz_bsw = iz_tnw + 1; + + int64_t ix_bse = ix_tnw + 1; + int64_t iy_bse = iy_tnw + 1; + int64_t iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw = clip_coordinates(ix_tnw, inp_W); + iy_tnw = clip_coordinates(iy_tnw, inp_H); + iz_tnw = clip_coordinates(iz_tnw, inp_D); + ix_tne = clip_coordinates(ix_tne, inp_W); + iy_tne = clip_coordinates(iy_tne, inp_H); + iz_tne = clip_coordinates(iz_tne, inp_D); + ix_tsw = clip_coordinates(ix_tsw, inp_W); + iy_tsw = clip_coordinates(iy_tsw, inp_H); + iz_tsw = clip_coordinates(iz_tsw, inp_D); + ix_tse = clip_coordinates(ix_tse, inp_W); + iy_tse = clip_coordinates(iy_tse, inp_H); + iz_tse = clip_coordinates(iz_tse, inp_D); + ix_bnw = clip_coordinates(ix_bnw, inp_W); + iy_bnw = clip_coordinates(iy_bnw, inp_H); + iz_bnw = clip_coordinates(iz_bnw, inp_D); + ix_bne = clip_coordinates(ix_bne, inp_W); + iy_bne = clip_coordinates(iy_bne, inp_H); + iz_bne = clip_coordinates(iz_bne, inp_D); + ix_bsw = clip_coordinates(ix_bsw, inp_W); + iy_bsw = clip_coordinates(iy_bsw, inp_H); + iz_bsw = clip_coordinates(iz_bsw, inp_D); + ix_bse = clip_coordinates(ix_bse, inp_W); + iy_bse = clip_coordinates(iy_bse, inp_H); + iz_bse = clip_coordinates(iz_bse, inp_D); + } + + // calculate bilinear weighted pixel value and set output pixel + scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + scalar_t *inp_ptr_NC = inp_ptr_N; + for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { + // (c, iz_tnw, 
iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne + // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse + // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne + // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse + *out_ptr_NCDHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; + } + } + } + } + } + } + return output; + } + + template + std::tuple + grid_sampler2d_backward_cpu_impl(const Tensor& grad_output, + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_H = input.size(2); + int64_t inp_W = input.size(3); + int64_t out_H = grid.size(1); + int64_t out_W = grid.size(2); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sH = input.stride(2); + int64_t inp_sW = input.stride(3); + int64_t grid_sN = grid.stride(0); + int64_t grid_sH = grid.stride(1); + int64_t grid_sW = grid.stride(2); + int64_t grid_sCoor = grid.stride(3); + int64_t gOut_sN = grad_output.stride(0); + int64_t gOut_sC = grad_output.stride(1); + int64_t gOut_sH = grad_output.stride(2); + int64_t gOut_sW = grad_output.stride(3); + int64_t gInp_sN = grad_input.stride(0); + int64_t gInp_sC = grad_input.stride(1); + int64_t gInp_sH = grad_input.stride(2); + int64_t gInp_sW = grad_input.stride(3); + int64_t gGrid_sN = grad_grid.stride(0); + int64_t gGrid_sW = grad_grid.stride(2); + scalar_t *inp_ptr = input.data(); + scalar_t *grid_ptr = grid.data(); + scalar_t *gOut_ptr = grad_output.data(); + scalar_t *gInp_ptr = grad_input.data(); + scalar_t *gGrid_ptr = grad_grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t 
n = 0; n < N; ++n) { + scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; + scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + + // get NE, NW, SE, SW pixel values from (x, y) + int64_t ix_nw = static_cast(std::floor(ix)); + int64_t iy_nw = static_cast(std::floor(iy)); + int64_t ix_ne = ix_nw + 1; + int64_t iy_ne = iy_nw; + int64_t ix_sw = ix_nw; + int64_t iy_sw = iy_nw + 1; + int64_t ix_se = ix_nw + 1; + int64_t iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + int64_t ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode == GridSamplerPadding::Border) { + // get clipped NE, NW, SE, SW pixel values from (x, y) + ix_nw_cl = clip_coordinates(ix_nw, inp_W); + iy_nw_cl = clip_coordinates(iy_nw, inp_H); + ix_ne_cl = clip_coordinates(ix_ne, inp_W); + iy_ne_cl = clip_coordinates(iy_ne, inp_H); + ix_sw_cl = clip_coordinates(ix_sw, inp_W); + iy_sw_cl = clip_coordinates(iy_sw, inp_H); + ix_se_cl = clip_coordinates(ix_se, inp_W); + iy_se_cl = clip_coordinates(iy_se, inp_H); + } else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + scalar_t gix = static_cast(0), giy = static_cast(0); + scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; + scalar_t *inp_ptr_NC = inp_ptr_N; + // calculate bilinear weighted pixel value and set output pixel + for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { + scalar_t gOut = *gOut_ptr_NCHW; + + // calculate and set grad_input + safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); + safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); + safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); + safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { + scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { + scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { + scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (padding_mode != 
GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { + scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1) / 2; + giy = giy * (inp_H - 1) / 2; + + // assuming grad_grid is contiguous + gGrid_ptr_NHW[0] = gix; + gGrid_ptr_NHW[1] = giy; + } + } + } + return std::make_tuple(grad_input, grad_grid); + } + + template + std::tuple + grid_sampler3d_backward_cpu_impl(const Tensor& grad_output, + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_D = input.size(2); + int64_t inp_H = input.size(3); + int64_t inp_W = input.size(4); + int64_t out_D = grid.size(1); + int64_t out_H = grid.size(2); + int64_t out_W = grid.size(3); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sD = input.stride(2); + int64_t inp_sH = input.stride(3); + int64_t inp_sW = input.stride(4); + int64_t grid_sN = grid.stride(0); + int64_t grid_sD = grid.stride(1); + int64_t grid_sH = grid.stride(2); + int64_t grid_sW = grid.stride(3); + int64_t grid_sCoor = grid.stride(4); + int64_t gOut_sN = grad_output.stride(0); + int64_t gOut_sC = grad_output.stride(1); + int64_t gOut_sD = grad_output.stride(2); + int64_t gOut_sH = grad_output.stride(3); + int64_t gOut_sW = grad_output.stride(4); + int64_t gInp_sN = grad_input.stride(0); + int64_t gInp_sC = grad_input.stride(1); + int64_t gInp_sD = grad_input.stride(2); + int64_t gInp_sH = grad_input.stride(3); + int64_t gInp_sW = grad_input.stride(4); + int64_t gGrid_sN = grad_grid.stride(0); + int64_t gGrid_sW = grad_grid.stride(3); + scalar_t *inp_ptr = input.data(); + scalar_t *grid_ptr = grid.data(); + scalar_t *gOut_ptr = grad_output.data(); + scalar_t *gInp_ptr = grad_input.data(); + scalar_t *gGrid_ptr = grad_grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t n = 0; n < N; ++n) { + scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; + for (int64_t d = 0; d < out_D; ++d) { + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { + // get the corresponding input x, y, z co-ordinates from grid + scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + scalar_t ix = *grid_ptr_NDHW; + scalar_t iy = grid_ptr_NDHW[grid_sCoor]; + scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + iz = ((iz + 1) / 2) * (inp_D - 1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int64_t ix_tnw = static_cast(std::floor(ix)); + int64_t iy_tnw = static_cast(std::floor(iy)); + int64_t iz_tnw = static_cast(std::floor(iz)); + + int64_t ix_tne = ix_tnw + 1; + int64_t iy_tne = iy_tnw; + int64_t iz_tne = iz_tnw; + + int64_t ix_tsw = ix_tnw; + int64_t iy_tsw = iy_tnw + 1; + int64_t iz_tsw = iz_tnw; + + int64_t ix_tse = ix_tnw + 1; + int64_t iy_tse = 
iy_tnw + 1; + int64_t iz_tse = iz_tnw; + + int64_t ix_bnw = ix_tnw; + int64_t iy_bnw = iy_tnw; + int64_t iz_bnw = iz_tnw + 1; + + int64_t ix_bne = ix_tnw + 1; + int64_t iy_bne = iy_tnw; + int64_t iz_bne = iz_tnw + 1; + + int64_t ix_bsw = ix_tnw; + int64_t iy_bsw = iy_tnw + 1; + int64_t iz_bsw = iz_tnw + 1; + + int64_t ix_bse = ix_tnw + 1; + int64_t iy_bse = iy_tnw + 1; + int64_t iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int64_t ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int64_t ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int64_t ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int64_t ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); + iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); + iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); + ix_tne_cl = clip_coordinates(ix_tne, inp_W); + iy_tne_cl = clip_coordinates(iy_tne, inp_H); + iz_tne_cl = clip_coordinates(iz_tne, inp_D); + ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); + iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); + iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); + ix_tse_cl = clip_coordinates(ix_tse, inp_W); + iy_tse_cl = clip_coordinates(iy_tse, inp_H); + iz_tse_cl = clip_coordinates(iz_tse, inp_D); + ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); + iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); + iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); + ix_bne_cl = clip_coordinates(ix_bne, inp_W); + iy_bne_cl = clip_coordinates(iy_bne, inp_H); + iz_bne_cl = clip_coordinates(iz_bne, inp_D); + ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); + iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); + iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); + ix_bse_cl = clip_coordinates(ix_bse, inp_W); + iy_bse_cl = clip_coordinates(iy_bse, inp_H); + iz_bse_cl = clip_coordinates(iz_bse, inp_D); + } else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); + scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; + scalar_t *inp_ptr_NC = inp_ptr_N; + // calculate bilinear weighted pixel value and set output pixel + for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { + scalar_t gOut = *gOut_ptr_NCDHW; + + // calculate and set grad_input + safe_add_3d(gInp_ptr_NC, 
iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); + safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); + safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); + safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { + scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { + scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { + scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { + scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { + scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { + scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { + scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * 
inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { + scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1) / 2; + giy = giy * (inp_H - 1) / 2; + giz = giz * (inp_D - 1) / 2; + + // assuming grad_grid is contiguous + gGrid_ptr_NDHW[0] = gix; + gGrid_ptr_NDHW[1] = giy; + gGrid_ptr_NDHW[2] = giz; + } + } + } + } + return std::make_tuple(grad_input, grad_grid); + } +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { + return grid_sampler2d_cpu_impl( + input, grid, static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler3d_cpu", [&] { + return grid_sampler3d_cpu_impl( + input, grid, static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +std::tuple +grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { + return grid_sampler2d_backward_cpu_impl( + grad_output, input, grid, + static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
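// The 2D/3D forward and backward kernels above all reduce to the same
// per-output-pixel computation.  A self-contained sketch of the 2D forward case
// on a plain contiguous C x H x W float array, covering the Zeros and Border
// padding modes that the kernels above special-case.  Helper and parameter
// names here are illustrative, not part of the patch.
#include <algorithm>
#include <cmath>
#include <cstdint>

enum class Padding { Zeros, Border };

inline int64_t clip(int64_t v, int64_t limit) {
  return std::min(limit - 1, std::max(v, static_cast<int64_t>(0)));
}

inline bool in_bounds(int64_t h, int64_t w, int64_t H, int64_t W) {
  return h >= 0 && h < H && w >= 0 && w < W;
}

// Sample every channel of `input` at the normalized grid location (gx, gy),
// both in [-1, 1]; writes one value per channel into `out`.
void sample_pixel_2d(const float* input, int64_t C, int64_t H, int64_t W,
                     float gx, float gy, Padding pad, float* out) {
  // unnormalize from [-1, 1] to [0, W-1] / [0, H-1]
  float ix = (gx + 1.f) / 2.f * (W - 1);
  float iy = (gy + 1.f) / 2.f * (H - 1);

  // the four surrounding integer corners
  int64_t x0 = static_cast<int64_t>(std::floor(ix)), x1 = x0 + 1;
  int64_t y0 = static_cast<int64_t>(std::floor(iy)), y1 = y0 + 1;

  // bilinear weight of each corner = area of the opposite sub-rectangle
  float w00 = (x1 - ix) * (y1 - iy), w01 = (ix - x0) * (y1 - iy);
  float w10 = (x1 - ix) * (iy - y0), w11 = (ix - x0) * (iy - y0);

  if (pad == Padding::Border) {
    x0 = clip(x0, W); x1 = clip(x1, W);
    y0 = clip(y0, H); y1 = clip(y1, H);
  }

  for (int64_t c = 0; c < C; ++c) {
    const float* plane = input + c * H * W;
    float acc = 0.f;
    // in Zeros mode, out-of-range corners simply contribute nothing
    if (in_bounds(y0, x0, H, W)) acc += plane[y0 * W + x0] * w00;
    if (in_bounds(y0, x1, H, W)) acc += plane[y0 * W + x1] * w01;
    if (in_bounds(y1, x0, H, W)) acc += plane[y1 * W + x0] * w10;
    if (in_bounds(y1, x1, H, W)) acc += plane[y1 * W + x1] * w11;
    out[c] = acc;
  }
}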
+std::tuple +grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_3d_backward_cpu", [&] { + return grid_sampler3d_backward_cpu_impl( + grad_output, input, grid, + static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { + AT_CHECK( + (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), + "grid_sampler(): expected 4D or 5D input and grid with same number " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); + AT_CHECK( + input.size(0) == grid.size(0), + "grid_sampler(): expected grid and input to have same batch size, but got " + "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); + AT_CHECK( + grid.size(-1) == input.dim() - 2, + "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " + "dimension, but got grid with sizes ", grid.sizes()); + // cudnn does not support inputs larger than 1024 + if (at::native::cudnn_is_acceptable(input) && + static_cast(padding_mode) == GridSamplerPadding::Zeros && + input.dim() == 4 && + input.size(1) <= 1024) { + return cudnn_grid_sampler(input, grid); + } + if (input.dim() == 4) { + return at::grid_sampler_2d(input, grid, 0, padding_mode); + } else { + return at::grid_sampler_3d(input, grid, 0, padding_mode); + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h new file mode 100644 index 00000000000000..f39b4e996469fa --- /dev/null +++ b/aten/src/ATen/native/GridSampler.h @@ -0,0 +1,9 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +namespace at { namespace native { namespace detail { + + enum class GridSamplerInterpolation {Bilinear, Nearest}; + enum class GridSamplerPadding {Zeros, Border, Reflection}; + +}}} // namespace at::native::detail diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 9720adb4895769..e4eb336cd5f453 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -69,11 +69,7 @@ static std::vector expandByteTensors(const Tensor & self, TensorList ind } // Replace with nonzeros auto nonzero = index.nonzero(); -#ifndef USE_TH_SIZE_ZERO_DIM - auto special_empty = nonzero.numel() == 0; -#else auto special_empty = false; -#endif for (int64_t j = 0; j < index.dim(); j++) { if (special_empty) { // We can't call select on an empty tensor so we just create an empty @@ -214,26 +210,10 @@ static Tensor computeLinearIndex(const Tensor & src, TensorList indices) { return linearIndex; } -#ifndef USE_TH_SIZE_ZERO_DIM -static bool hasEmptyTensor(TensorList tensors) { - for (auto& tensor : tensors) { - if (tensor.defined() && tensor.numel() == 0) { - return true; - } - } - return false; -} -#endif - static std::tuple makeLinearIndex(Tensor self, TensorList orig) { checkIndexTensorTypes(orig); // first expand ByteTensor (boolean masks) into 1 or more LongTensors auto indices = expandByteTensors(self, orig); -#ifndef USE_TH_SIZE_ZERO_DIM - if (hasEmptyTensor(indices)) { - return std::make_tuple(self, self.type().toScalarType(kLong).tensor()); - } -#endif // next broadcast all index tensors together indices = expand_outplace(indices); // add missing null Tensors so that it matches self.dim() @@ -299,11 +279,11 @@ Tensor & index_copy_(Tensor 
& self, int64_t dim, const Tensor & index, const Ten } // Check that source and destination slices have the same size - auto selfSlicedSizes = std::vector(self.sizes()); + auto selfSlicedSizes = self.sizes().vec(); if (selfSlicedSizes.size() > 0) { selfSlicedSizes.erase(selfSlicedSizes.begin() + dim); } - auto sourceSlicedSizes = std::vector(source.sizes()); + auto sourceSlicedSizes = source.sizes().vec(); if (sourceSlicedSizes.size() > 0) { sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index cb24e71119f9b1..c82bf8ba0ae043 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtilsMulti.h" +#include namespace at { namespace native { @@ -136,6 +137,8 @@ Tensor einsum(std::string eqn, TensorList tensors) { } else { in_eqn = eqn; } + // remove spaces for einsum compatibility (#9929) + in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index int64_t operand = 0; @@ -212,7 +215,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_output_dims++; } } - } else { // letter (hopefully) + } else if (! isspace(c)) { // letter (hopefully) AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp new file mode 100644 index 00000000000000..092b7255eb4a0d --- /dev/null +++ b/aten/src/ATen/native/LossCTC.cpp @@ -0,0 +1,365 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the CPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) + +#include +#include "ATen/Dispatch.h" +#include "ATen/TensorUtils.h" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +template +static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// This kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). +// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// The function returns the loss and the alphas, the alphas are kept for the backward step. The wrapper (ctc_loss below) hides +// the alphas from the user by only returning the loss. 
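// A minimal single-sequence sketch of the alpha recursion described above
// (eqs. (6)-(8) in Graves et al.), on plain std::vector data instead of ATen
// tensors.  log_probs is T x num_labels, row-major, already log-softmaxed.
// The augmented-target and skip-connection logic mirrors get_target_prime and
// the t/s loops below; the function itself is illustrative, not the patch's API.
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

double ctc_nll_single(const std::vector<double>& log_probs, int64_t T,
                      int64_t num_labels, const std::vector<int64_t>& target,
                      int64_t BLANK = 0) {
  const double neginf = -std::numeric_limits<double>::infinity();
  const int64_t L = static_cast<int64_t>(target.size());
  const int64_t S = 2 * L + 1;  // augmented target l': blank, t0, blank, t1, ...
  auto lprime = [&](int64_t s) { return s % 2 == 0 ? BLANK : target[s / 2]; };
  auto lp = [&](int64_t t, int64_t c) { return log_probs[t * num_labels + c]; };

  std::vector<double> alpha(T * S, neginf);
  alpha[0 * S + 0] = lp(0, BLANK);
  if (L > 0) alpha[0 * S + 1] = lp(0, lprime(1));

  for (int64_t t = 1; t < T; ++t) {
    for (int64_t s = 0; s < S; ++s) {
      // paths may stay on s, advance from s-1, or skip from s-2 when the
      // skipped symbol differs (no repeated label, not a blank)
      double a1 = alpha[(t - 1) * S + s];
      double a2 = s > 0 ? alpha[(t - 1) * S + s - 1] : neginf;
      double a3 = (s > 1 && lprime(s - 2) != lprime(s))
                      ? alpha[(t - 1) * S + s - 2] : neginf;
      double m = std::max(a1, std::max(a2, a3));
      if (m == neginf) m = 0;  // cannot do neginf - neginf
      alpha[t * S + s] = std::log(std::exp(a1 - m) + std::exp(a2 - m) +
                                  std::exp(a3 - m)) + m + lp(t, lprime(s));
    }
  }
  // eq (8): valid paths end in the last blank or the last label
  double l1 = alpha[(T - 1) * S + (S - 1)];
  double l2 = L > 0 ? alpha[(T - 1) * S + (S - 2)] : neginf;
  double m = std::max(l1, l2);
  if (m == neginf) m = 0;
  return -(std::log(std::exp(l1 - m) + std::exp(l2 - m)) + m);
}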
+template +std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + + CheckedFrom c = "ctc_loss_cpu"; + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + size_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto targets_data = targets.data(); + auto neg_log_likelihood_a = neg_log_likelihood.accessor(); + + // alpha calculation for the first row, the three equations for alpha_1 above eq (6) + // first the default + log_alpha.narrow(1, 0, 1).fill_(neginf); + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the first two items of alpha_t above eq (6) + log_alpha_a[0][0] = log_probs_a[0][BLANK]; + if (target_length > 0) + log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; + + // now the loop over the inputs + for (int64_t t=1; t 0) { 
+ la2 = log_alpha_a[t-1][s-1]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_target_prime)) { + la3 = log_alpha_a[t-1][s-2]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // cannot do neginf-neginf + lamax = 0; + // this is the assignment of eq (6) + log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime]; + } + } + // the likelihood is the the sum of the last two alphas, eq (8), the loss is the negative log likelihood + scalar_t l1 = log_alpha_a[input_length-1][target_length*2]; + scalar_t l2 = log_alpha_a[input_length-1][target_length*2-1]; + scalar_t m = std::max(l1, l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_a[b] = -log_likelihood; + } + + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// This is the backward. It consists of two phases: +// a) computing the beta analogous to the alphas in the forward (backward half of the forward-backward algorithm) (eq (10) and (11)) +// b) collecting the per-activation characters for all s and wrapping the gradient (eq (16), the collection is the sum) +template +Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + int64_t max_input_length = log_probs.size(0); + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + Tensor grad = at::full_like(log_probs, neginf); // at this point, this is log of empty sum + + // The admin bits. We don't do much checking and assume that the forward did. 
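// A self-contained sketch of the two target layouts handled by the bookkeeping
// below.  Targets arrive either as one 1-D tensor of all labels concatenated
// (target_lengths gives each item's share) or as a padded 2-D
// batch x max_target_length tensor; either way, item b's labels are read as
// targets_data[batch_offset[b] + stride * idx].  Plain contiguous vectors
// (stride 1 / row length) and hypothetical names stand in for the ATen stride
// bookkeeping.
#include <algorithm>
#include <cstdint>
#include <vector>

struct TargetLayout {
  std::vector<int64_t> batch_offsets;  // where item b's labels start
  int64_t stride;                      // step between consecutive labels of one item
  int64_t max_target_length;
};

TargetLayout concatenated_layout(const std::vector<int64_t>& target_lengths) {
  TargetLayout out{{}, /*stride=*/1, /*max_target_length=*/0};
  int64_t pos = 0;
  for (int64_t len : target_lengths) {
    out.batch_offsets.push_back(pos);  // items are packed back to back
    pos += len;
    out.max_target_length = std::max(out.max_target_length, len);
  }
  return out;
}

TargetLayout padded_layout(int64_t batch_size, int64_t max_target_length) {
  TargetLayout out{{}, /*stride=*/1, max_target_length};
  for (int64_t b = 0; b < batch_size; ++b) {
    out.batch_offsets.push_back(b * max_target_length);  // start of row b
  }
  return out;
}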
+ int64_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + + Tensor log_beta = at::empty_like(log_alpha); // could be optimized to use only 2 rows + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto log_beta_a_global = log_beta.accessor(); + auto gp = grad.permute({1,0,2}); + auto grad_a_global = gp.accessor(); + auto targets_data = targets.data(); + + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + auto log_beta_a = log_beta_a_global[b]; + auto grad_a = grad_a_global[b]; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the initialization of beta before eq (10) + // here we do the fill for each batch item separately, as the input lengths will differ, so the t in which + // we start varies + if (input_length > 0) { + log_beta.narrow(0, b, 1).narrow(1, input_length-1, 1).fill_(neginf); + log_beta_a[input_length-1][2*target_length] = log_probs_a[input_length-1][BLANK]; + grad_a[input_length-1][BLANK] = log_alpha_a[input_length-1][2*target_length] + log_beta_a[input_length-1][2*target_length]; + + if (target_length > 0) { + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 2*target_length-1, BLANK); + log_beta_a[input_length-1][2*target_length-1] = log_probs_a[input_length-1][current_target_prime]; + + // the first two are a blank and a non-blank, so we know they are different and we don't need to do log+ + grad_a[input_length-1][current_target_prime] = log_alpha_a[input_length-1][2*target_length-1] + log_beta_a[input_length-1][2*target_length-1]; + } + } + + // now loop applying eq (10) / (11) + for (int64_t t=input_length-2; t>=0; t--) { + // this loop over s could be parallel/vectorized and doesn't really need to be descending... + // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending) + // for the cuda implementation, that gave a speed boost. 
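// The s-loop below repeatedly performs a numerically stable "log +=",
// accumulating log_alpha + log_beta into the gradient bucket of the current
// target character (eq. (16)).  A standalone sketch of that primitive,
// assuming -infinity encodes an empty sum; names are illustrative.
#include <algorithm>
#include <cmath>
#include <limits>

// returns log(exp(a) + exp(b)) without overflow/underflow
inline double log_add_exp(double a, double b) {
  const double neginf = -std::numeric_limits<double>::infinity();
  if (a == neginf) return b;  // adding to an empty sum
  if (b == neginf) return a;
  double m = std::max(a, b);
  return std::log(std::exp(a - m) + std::exp(b - m)) + m;
}

// usage: a gradient bucket starts at -infinity (log of an empty sum), then for
// every s that maps to the same character we do
//   grad_bucket = log_add_exp(grad_bucket, log_alpha + log_beta);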
+ for (int64_t s=2*target_length; s>=0; s--) { + scalar_t lb1 = log_beta_a[t+1][s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + if (s < 2*target_length) { + lb2 = log_beta_a[t+1][s+1]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if ((s < 2*target_length-1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)) { + lb3 = log_beta_a[t+1][s+2]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime]; + // one might check whether one can vectorize this better when done after the t-loop... + // now that we have beta, we fill in the sum of alpha*beta in eq (16) + // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency + // issue (several s can map to the same target character) + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s]; + scalar_t &lcab = grad_a[t][current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = std::max(lcab, log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + // now grad has the sum of eq (16) + // now we wrap up the calculation by adding in the remaining items of eq (16) + // this could be a great target for further vectorization. + // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) + scalar_t nll = neg_log_likelihood.accessor()[b]; + scalar_t gr = grad_out.accessor()[b]; + for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? 
+ for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = grad_a[t][c]; + scalar_t lp = log_probs_a[t][c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + } + // zero the remainder + if (input_length < max_input_length) { + grad.narrow(0, input_length, max_input_length - input_length).narrow(1, b, 1).zero_(); + } + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +// this wrapper function dispatches to the native and cudnn implementations and hides the alpha/grad from the user (by just returning the loss) +// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients +// it also handles the reduction if desired +Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, int64_t reduction) { + auto& ctx = at::globalContext(); + + bool use_cudnn = + detail::getCUDAHooks().compiledWithCuDNN() && + (detail::getCUDAHooks().versionCuDNN() >= 7000) && + ctx.userEnabledCuDNN() && + (BLANK == 0) && (targets.dim()==1) && + (log_probs.type().scalarType() == at::kFloat) && + (targets.type().scalarType() == at::kInt) && + (log_probs.type().backend() == Backend::CUDA); + + if (use_cudnn) { + // we don't know that input_lengths and target_lengths have the same size (they should, but we didn't check yet) + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < input_lengths.size(); b++) { + use_cudnn &= (input_lengths[b] == max_input_length); + } + for (int64_t b = 0; b < target_lengths.size(); b++) { + use_cudnn &= (target_lengths[b] <= 256); + } + } + + Tensor res; + if (use_cudnn) { + res = std::get<0>(at::_cudnn_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK, ctx.deterministicCuDNN())); + } else { + res = std::get<0>(at::_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK)); + } + if (reduction == Reduction::ElementwiseMean) { + auto target_lengths_t = at::tensor(target_lengths, res.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(res.type()); + return (res / target_lengths_t).mean(); + } else if (reduction == Reduction::Sum) { + return res.sum(); + } + return res; +} + +} } // at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index d6ebbd4573a70c..b84b9c3f36b3ea 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ 
b/aten/src/ATen/native/TensorFactories.cpp @@ -141,17 +141,9 @@ Tensor& eye_out_cpu(Tensor& result, int64_t n) { } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { -#ifndef USE_TH_SIZE_ZERO_DIM - AT_CHECK(n > 0, "n must be greater than 0, got ", n); -#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); -#endif -#ifndef USE_TH_SIZE_ZERO_DIM - if(m <= 0) { -#else if(m < 0) { -#endif m = n; } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index f7ced03c5ab6fc..be7e626fa1b748 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -12,6 +12,10 @@ namespace at { namespace native { +std::vector broadcast_tensors(TensorList tensors) { + return expand_outplace(tensors); +} + static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; @@ -78,9 +82,6 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ } else { diag_size = std::max(std::min(self.size(dim1)+offset, self.size(dim2)), 0); } -#ifndef USE_TH_SIZE_ZERO_DIM - AT_CHECK(diag_size > 0, "invalid diagonal offset ", offset); // the diagonal offset was too large in magnitude -#endif // NumPy allows you to specify offsets "off the end"; let's just be careful not to // set a ridiculous storage_offset in that case (technically it shouldn't matter @@ -95,8 +96,8 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minumum) // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics - auto sizes = std::vector(self.sizes()); - auto strides = std::vector(self.strides()); + auto sizes = self.sizes().vec(); + auto strides = self.strides().vec(); sizes.erase(sizes.begin() + std::max(dim1, dim2)); strides.erase(strides.begin() + std::max(dim1, dim2)); sizes.erase(sizes.begin() + std::min(dim1, dim2)); @@ -157,11 +158,7 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } -#ifndef USE_TH_SIZE_ZERO_DIM - if (length <= 0 || start > cur_size - length) { -#else if (length < 0 || start > cur_size - length) { -#endif AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); } return at::slice(self, dim, start, start + length, 1); @@ -246,14 +243,6 @@ static std::vector infer_size(IntList shape, int64_t numel) { AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape); res[*infer_dim] = numel / newsize; } -#ifndef USE_TH_SIZE_ZERO_DIM - if (numel == 0) { - // Collapse zero-element shapes into one dimension because TH handles zeros - // in sizes strangely: x.resize_(1, 0) has shape (1,). TODO: remove this - // once we have multi-dimensional empty tensors. 
- return {0}; - } -#endif return res; } @@ -291,8 +280,8 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { if (index < 0) { index += size; } - auto sizes = std::vector(self.sizes()); - auto strides = std::vector(self.strides()); + auto sizes = self.sizes().vec(); + auto strides = self.strides().vec(); auto storage_offset = self.storage_offset() + index * strides[dim]; sizes.erase(sizes.begin() + dim); strides.erase(strides.begin() + dim); @@ -303,8 +292,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ int64_t ndim = self.dim(); AT_CHECK(ndim > 0, "slice() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); - auto sizes = std::vector(self.sizes()); - auto strides = std::vector(self.strides()); + auto sizes = self.sizes().vec(); + auto strides = self.strides().vec(); if (step <= 0) { // TODO: support negative strides throw std::runtime_error("slice step must be positive"); @@ -327,12 +316,6 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ } auto storage_offset = self.storage_offset() + start * strides[dim]; auto len = end - start; -#ifndef USE_TH_SIZE_ZERO_DIM - if (len == 0) { - // TODO: currently we don't have support for 0-sized dims, return size 0 tensor for now - return self.type().tensor(); - } -#endif sizes[dim] = (len + step - 1) / step; // round-up strides[dim] *= step; return self.as_strided(sizes, strides, storage_offset); @@ -424,7 +407,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di } if (self._indices().numel() == 0 && self._values().numel() == 0) { - std::vector sizes(self.sizes()); + auto sizes = self.sizes().vec(); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, self._sparseDims(), self._denseDims()); @@ -439,7 +422,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di row0.copy_(row1); row1.copy_(tmp); - std::vector sizes(self.sizes()); + auto sizes = self.sizes().vec(); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, -1, -1); @@ -458,8 +441,8 @@ Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self, dim0, dim1); } - std::vector strides(self.strides()); - std::vector sizes(self.sizes()); + auto strides = self.strides().vec(); + auto sizes = self.sizes().vec(); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided_(sizes, strides); @@ -478,8 +461,8 @@ Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self_clone, dim0, dim1); } - std::vector strides(self.strides()); - std::vector sizes(self.sizes()); + auto strides = self.strides().vec(); + auto sizes = self.sizes().vec(); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided(sizes, strides); @@ -539,13 +522,8 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { std::tuple, std::vector > inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { -#ifndef USE_TH_SIZE_ZERO_DIM - if (tensor.numel() == 0) { - throw std::runtime_error("cannot unsqueeze empty tensor"); - } -#endif - std::vector sizes(tensor.sizes()); - std::vector strides(tensor.strides()); + auto sizes = tensor.sizes().vec(); + auto strides = tensor.strides().vec(); int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; sizes.insert(sizes.begin() + dim, 1); strides.insert(strides.begin() + dim, new_stride); @@ -563,7 +541,7 @@ Tensor squeeze(const Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, dims); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided(self.sizes().vec(), self.strides().vec()); + return self.as_strided(self.sizes(), self.strides()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided(std::get<0>(g), std::get<1>(g)); @@ -579,7 +557,7 @@ Tensor & squeeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim()); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided_(self.sizes().vec(), self.strides().vec()); + return self.as_strided_(self.sizes(), self.strides()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided_(std::get<0>(g), std::get<1>(g)); diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 84759874ef5355..0648387b35d5ae 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -13,7 +13,7 @@ Tensor flip_cpu(const Tensor& self, IntList dims) { const int64_t total_dims = self.dim(), flip_dims_size = dims.size(); flip_check_errors(total_dims, flip_dims_size, dims); - auto flip_dims_v = std::vector(dims); + auto flip_dims_v = dims.vec(); wrap_all_dims(flip_dims_v, total_dims); std::sort(flip_dims_v.begin(), flip_dims_v.end()); auto final_indices = std::vector(total_dims); diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 2504a2c3f201b8..9b8c7d62b585c6 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -14,7 +14,7 @@ static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, AT_CHECK(flip_dims_size > 0 && flip_dims_size <= total_dims, "flip dims size out of range, got flip dims size=", flip_dims_size); - auto flip_dims_v = std::vector(dims); + auto flip_dims_v = dims.vec(); // check if dims axis within range auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end()); diff --git a/aten/src/ATen/native/Vision.cpp b/aten/src/ATen/native/Vision.cpp deleted file mode 100644 index 458e9aca23f0fe..00000000000000 --- a/aten/src/ATen/native/Vision.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "ATen/ATen.h" -#include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" - -namespace { - enum GridSamplerMode {GridSamplerModeZeros, GridSamplerModeBorder}; -} - -namespace at { namespace native { - -Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { - // cudnn does not support inputs larger than 1024 - if (at::native::cudnn_is_acceptable(input) && - padding_mode == GridSamplerModeZeros && - input.dim() == 4 && - input.size(1) <= 1024) { - return cudnn_grid_sampler(input, grid); - } - if (input.dim() == 4) { - return thnn_grid_sampler_bilinear2d(input, grid, padding_mode); - } - if (input.dim() == 5) { - return thnn_grid_sampler_bilinear3d(input, grid, padding_mode); - } - AT_ERROR("grid_sampler(): input must be 4d or 5d but got input of shape: ", input.dim()); -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu new file mode 100644 index 00000000000000..a47865f2023474 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -0,0 +1,788 @@ +#include "ATen/ATen.h" +#include 
"ATen/native/GridSampler.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" +#include "ATen/cuda/detail/TensorInfo.cuh" +#include "ATen/cuda/detail/IndexUtils.cuh" +#include "ATen/cuda/detail/KernelUtils.h" + +namespace at { namespace native { + +using namespace at::cuda::detail; + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; + +namespace { + static __forceinline__ __device__ + int clip_coordinates(int in, int clip_limit) { + return ::min(clip_limit - 1, ::max(in, static_cast(0))); + } + + static __forceinline__ __device__ + bool within_bounds_2d(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; + } + + static __forceinline__ __device__ + bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; + } + + template + static __forceinline__ __device__ + void safe_add_2d(scalar_t *data, int h, int w, + int sH, int sW, int H, int W, + scalar_t delta) { + if (within_bounds_2d(h, w, H, W)) { + atomicAdd(data + h * sH + w * sW, delta); + } + } + + template + static __forceinline__ __device__ + void safe_add_3d(scalar_t *data, int d, int h, int w, + int sD, int sH, int sW, int D, int H, int W, + scalar_t delta) { + if (within_bounds_3d(d, h, w, D, H, W)) { + atomicAdd(data + d * sD + h * sH + w * sW, delta); + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_2d_kernel( + const int nthreads, + TensorInfo input, + TensorInfo grid, + TensorInfo output, + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_H = input.sizes[2]; + int inp_W = input.sizes[3]; + int out_H = grid.sizes[1]; + int out_W = grid.sizes[2]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sH = input.strides[2]; + int inp_sW = input.strides[3]; + int grid_sN = grid.strides[0]; + int grid_sH = grid.strides[1]; + int grid_sW = grid.strides[2]; + int grid_sCoor = grid.strides[3]; + int out_sN = output.strides[0]; + int out_sC = output.strides[1]; + int out_sH = output.strides[2]; + int out_sW = output.strides[3]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int n = index / (out_H * out_W); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = static_cast(::floor(ixf)); + int iy_nw = static_cast(::floor(iyf)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_nw = clip_coordinates(ix_nw, inp_W); + iy_nw = clip_coordinates(iy_nw, inp_H); + ix_ne = clip_coordinates(ix_ne, inp_W); + iy_ne = 
clip_coordinates(iy_ne, inp_H); + ix_sw = clip_coordinates(ix_sw, inp_W); + iy_sw = clip_coordinates(iy_sw, inp_H); + ix_se = clip_coordinates(ix_se, inp_W); + iy_se = clip_coordinates(iy_se, inp_H); + } + + auto inp_ptr_NC = input.data + n * inp_sN; + auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_3d_kernel( + const int nthreads, + TensorInfo input, + TensorInfo grid, + TensorInfo output, + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_D = input.sizes[2]; + int inp_H = input.sizes[3]; + int inp_W = input.sizes[4]; + int out_D = grid.sizes[1]; + int out_H = grid.sizes[2]; + int out_W = grid.sizes[3]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sD = input.strides[2]; + int inp_sH = input.strides[3]; + int inp_sW = input.strides[4]; + int grid_sN = grid.strides[0]; + int grid_sD = grid.strides[1]; + int grid_sH = grid.strides[2]; + int grid_sW = grid.strides[3]; + int grid_sCoor = grid.strides[4]; + int out_sN = output.strides[0]; + int out_sC = output.strides[1]; + int out_sD = output.strides[2]; + int out_sH = output.strides[3]; + int out_sW = output.strides[4]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int d = (index / (out_H * out_W)) % out_D; + const int n = index / (out_D * out_H * out_W); + const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + float izf = ((iz + 1.f) / 2) * (inp_D - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + iz = static_cast(izf); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = static_cast(::floor(ix)); + int iy_tnw = static_cast(::floor(iy)); + int iz_tnw = static_cast(::floor(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = 
iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw = clip_coordinates(ix_tnw, inp_W); + iy_tnw = clip_coordinates(iy_tnw, inp_H); + iz_tnw = clip_coordinates(iz_tnw, inp_D); + ix_tne = clip_coordinates(ix_tne, inp_W); + iy_tne = clip_coordinates(iy_tne, inp_H); + iz_tne = clip_coordinates(iz_tne, inp_D); + ix_tsw = clip_coordinates(ix_tsw, inp_W); + iy_tsw = clip_coordinates(iy_tsw, inp_H); + iz_tsw = clip_coordinates(iz_tsw, inp_D); + ix_tse = clip_coordinates(ix_tse, inp_W); + iy_tse = clip_coordinates(iy_tse, inp_H); + iz_tse = clip_coordinates(iz_tse, inp_D); + ix_bnw = clip_coordinates(ix_bnw, inp_W); + iy_bnw = clip_coordinates(iy_bnw, inp_H); + iz_bnw = clip_coordinates(iz_bnw, inp_D); + ix_bne = clip_coordinates(ix_bne, inp_W); + iy_bne = clip_coordinates(iy_bne, inp_H); + iz_bne = clip_coordinates(iz_bne, inp_D); + ix_bsw = clip_coordinates(ix_bsw, inp_W); + iy_bsw = clip_coordinates(iy_bsw, inp_H); + iz_bsw = clip_coordinates(iz_bsw, inp_D); + ix_bse = clip_coordinates(ix_bse, inp_W); + iy_bse = clip_coordinates(iy_bse, inp_H); + iz_bse = clip_coordinates(iz_bse, inp_D); + } + + auto inp_ptr_NC = input.data + n * inp_sN; + auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne + // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse + // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne + // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse + *out_ptr_NCDHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += 
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; + } + } + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_2d_backward_kernel( + const int nthreads, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, + TensorInfo grad_input, // initialized to zeros + TensorInfo grad_grid, // initialized to empty + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_H = input.sizes[2]; + int inp_W = input.sizes[3]; + int out_H = grid.sizes[1]; + int out_W = grid.sizes[2]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sH = input.strides[2]; + int inp_sW = input.strides[3]; + int grid_sN = grid.strides[0]; + int grid_sH = grid.strides[1]; + int grid_sW = grid.strides[2]; + int grid_sCoor = grid.strides[3]; + int gOut_sN = grad_output.strides[0]; + int gOut_sC = grad_output.strides[1]; + int gOut_sH = grad_output.strides[2]; + int gOut_sW = grad_output.strides[3]; + int gInp_sN = grad_input.strides[0]; + int gInp_sC = grad_input.strides[1]; + int gInp_sH = grad_input.strides[2]; + int gInp_sW = grad_input.strides[3]; + int gGrid_sW = grad_grid.strides[2]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int n = index / (out_H * out_W); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = static_cast(::floor(ixf)); + int iy_nw = static_cast(::floor(iyf)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_nw_cl = clip_coordinates(ix_nw, inp_W); + iy_nw_cl = clip_coordinates(iy_nw, inp_H); + ix_ne_cl = clip_coordinates(ix_ne, inp_W); + iy_ne_cl = clip_coordinates(iy_ne, inp_H); + ix_sw_cl = clip_coordinates(ix_sw, inp_W); + iy_sw_cl = clip_coordinates(iy_sw, inp_H); + ix_se_cl = clip_coordinates(ix_se, inp_W); + iy_se_cl = clip_coordinates(iy_se, inp_H); + } else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + scalar_t gix = static_cast(0), giy = static_cast(0); + scalar_t *gOut_ptr_NCHW = grad_output.data + 
n * gOut_sN + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; + scalar_t *inp_ptr_NC = input.data + n * inp_sN; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) { + scalar_t gOut = *gOut_ptr_NCHW; + + // calculate and set grad_input + safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); + safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); + safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); + safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { + scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { + scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { + scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { + scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1.f) / 2; + giy = giy * (inp_H - 1.f) / 2; + + // assuming grad_grid is contiguous + // thus we can + // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW + // 2. 
directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] + scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; + gGrid_ptr_NHW[0] = gix; + gGrid_ptr_NHW[1] = giy; + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_3d_backward_kernel( + const int nthreads, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, + TensorInfo grad_input, // initialized to zeros + TensorInfo grad_grid, // initialized to empty + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_D = input.sizes[2]; + int inp_H = input.sizes[3]; + int inp_W = input.sizes[4]; + int out_D = grid.sizes[1]; + int out_H = grid.sizes[2]; + int out_W = grid.sizes[3]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sD = input.strides[2]; + int inp_sH = input.strides[3]; + int inp_sW = input.strides[4]; + int grid_sN = grid.strides[0]; + int grid_sD = grid.strides[1]; + int grid_sH = grid.strides[2]; + int grid_sW = grid.strides[3]; + int grid_sCoor = grid.strides[4]; + int gOut_sN = grad_output.strides[0]; + int gOut_sC = grad_output.strides[1]; + int gOut_sD = grad_output.strides[2]; + int gOut_sH = grad_output.strides[3]; + int gOut_sW = grad_output.strides[4]; + int gInp_sN = grad_input.strides[0]; + int gInp_sC = grad_input.strides[1]; + int gInp_sD = grad_input.strides[2]; + int gInp_sH = grad_input.strides[3]; + int gInp_sW = grad_input.strides[4]; + int gGrid_sW = grad_grid.strides[3]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int d = (index / (out_H * out_W)) % out_D; + const int n = index / (out_D * out_H * out_W); + const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + float izf = ((iz + 1.f) / 2) * (inp_D - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + iz = static_cast(izf); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = static_cast(::floor(ix)); + int iy_tnw = static_cast(::floor(iy)); + int iz_tnw = static_cast(::floor(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + 
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); + iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); + iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); + ix_tne_cl = clip_coordinates(ix_tne, inp_W); + iy_tne_cl = clip_coordinates(iy_tne, inp_H); + iz_tne_cl = clip_coordinates(iz_tne, inp_D); + ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); + iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); + iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); + ix_tse_cl = clip_coordinates(ix_tse, inp_W); + iy_tse_cl = clip_coordinates(iy_tse, inp_H); + iz_tse_cl = clip_coordinates(iz_tse, inp_D); + ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); + iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); + iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); + ix_bne_cl = clip_coordinates(ix_bne, inp_W); + iy_bne_cl = clip_coordinates(iy_bne, inp_H); + iz_bne_cl = clip_coordinates(iz_bne, inp_D); + ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); + iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); + iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); + ix_bse_cl = clip_coordinates(ix_bse, inp_W); + iy_bse_cl = clip_coordinates(iy_bse, inp_H); + iz_bse_cl = clip_coordinates(iz_bse, inp_D); + } else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); + scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; + scalar_t *inp_ptr_NC = input.data + n * inp_sN; + // calculate bilinear weighted pixel value and set output pixel + for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { + scalar_t gOut = *gOut_ptr_NCDHW; + + // calculate and set grad_input + safe_add_3d(gInp_ptr_NC, iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); + safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); + safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); + safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, 
gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { + scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { + scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { + scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { + scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { + scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { + scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { + scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { + scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1) / 2; + giy = giy * (inp_H - 1) / 2; + giz = giz * (inp_D - 1) / 2; + + // assuming grad_grid is contiguous + // thus we can + // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW + // 2. 
directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] + scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; + gGrid_ptr_NDHW[0] = gix; + gGrid_ptr_NDHW[1] = giy; + gGrid_ptr_NDHW[2] = giz; + } + } +} // namespace + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto output = at::empty({N, input.size(1), H, W}, input.options()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { + int count = static_cast(N * H * W); + grid_sampler_2d_kernel + <<>>( + count, + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(output), + static_cast(padding_mode)); + }); + return output; +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto D = grid.size(1); + auto H = grid.size(2); + auto W = grid.size(3); + auto output = at::empty({N, input.size(1), D, H, W}, input.options()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { + int count = static_cast(N * D * H * W); + grid_sampler_3d_kernel + <<>>( + count, + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(output), + static_cast(padding_mode)); + }); + return output; +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +std::tuple +grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_backward_cuda", [&] { + int count = static_cast(N * H * W); + grid_sampler_2d_backward_kernel + <<>>( + count, + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(grad_input), + getTensorInfo(grad_grid), + static_cast(padding_mode)); + }); + return std::make_tuple(grad_input, grad_grid); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
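Before moving on to the 3-D backward wrapper, here is a minimal CPU sketch of the bilinear sampling that the 2-D forward kernel above performs for one output location: unnormalize the grid coordinate from [-1, 1], take the four neighbouring pixels, and blend them with the nw/ne/sw/se weights, clipping to the border or zeroing out-of-range taps. This is only an illustration (plain single-channel HxW input, made-up function name), not part of the diff.

#include <algorithm>
#include <cmath>
#include <vector>

enum class Padding { Zeros, Border };

float bilinear_sample(const std::vector<float>& img, int H, int W,
                      float gx, float gy, Padding padding) {
  // normalize from [-1, 1] to [0, W-1] / [0, H-1]
  float ix = ((gx + 1.f) / 2.f) * (W - 1);
  float iy = ((gy + 1.f) / 2.f) * (H - 1);

  int ix_nw = (int)std::floor(ix), iy_nw = (int)std::floor(iy);
  int ix_se = ix_nw + 1,           iy_se = iy_nw + 1;

  // bilinear surface weights, same form as nw/ne/sw/se in the kernel
  float nw = (ix_se - ix) * (iy_se - iy);
  float ne = (ix - ix_nw) * (iy_se - iy);
  float sw = (ix_se - ix) * (iy - iy_nw);
  float se = (ix - ix_nw) * (iy - iy_nw);

  auto clip = [](int v, int limit) { return std::min(limit - 1, std::max(v, 0)); };
  auto tap = [&](int y, int x, float w) -> float {
    if (padding == Padding::Border) { y = clip(y, H); x = clip(x, W); }
    else if (y < 0 || y >= H || x < 0 || x >= W) return 0.f;  // Zeros padding skips the tap
    return img[y * W + x] * w;
  };
  return tap(iy_nw, ix_nw, nw) + tap(iy_nw, ix_se, ne) +
         tap(iy_se, ix_nw, sw) + tap(iy_se, ix_se, se);
}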
+std::tuple +grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto D = grid.size(1); + auto H = grid.size(2); + auto W = grid.size(3); + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_3d_backward_cuda", [&] { + int count = static_cast(N * D * H * W); + grid_sampler_3d_backward_kernel + <<>>( + count, + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(grad_input), + getTensorInfo(grad_grid), + static_cast(padding_mode)); + }); + return std::make_tuple(grad_input, grad_grid); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 4b474e0c079e77..12f22fcaf2f216 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -76,6 +76,9 @@ void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { using arg0_t = typename traits::result_type; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_trivial_1d()) { auto strides = iter.get_inner_strides(); int stride0 = strides[0]; @@ -105,6 +108,9 @@ void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { using arg1_t = typename traits::arg1_t; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); @@ -152,6 +158,9 @@ void gpu_binary_kernel(TensorIterator& iter, const func_t& f) { using arg2_t = typename traits::arg2_t; int numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu new file mode 100644 index 00000000000000..70ece3f4440cf7 --- /dev/null +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -0,0 +1,625 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the GPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) +// A few optimizations (simmilar to those here, but also some I didn't take) are described in +// 2. Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf + +#include +#include + +#include +#include "ATen/Dispatch.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +// __restrict__ impact to be measured, https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ +template +__device__ static inline int64_t get_target_prime(const target_t* __restrict__ target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// this kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). 
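The get_target_prime helper above encodes the augmented label sequence l' from [1]: blanks at every even position, the original labels at the odd positions, so the augmented length is 2*target_length+1. A minimal CPU sketch of that mapping (illustrative names only):

#include <cstdint>
#include <vector>

int64_t target_prime(const std::vector<int64_t>& targets, int64_t idx, int64_t BLANK) {
  return (idx % 2 == 0) ? BLANK : targets[idx / 2];
}

// Example: targets {3, 1, 4} with BLANK = 0 expand to {0, 3, 0, 1, 0, 4, 0}.
std::vector<int64_t> augmented(const std::vector<int64_t>& targets, int64_t BLANK) {
  std::vector<int64_t> lp(2 * targets.size() + 1);
  for (size_t i = 0; i < lp.size(); ++i) {
    lp[i] = target_prime(targets, (int64_t)i, BLANK);
  }
  return lp;
}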
+// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// In total it would be more efficient to compute the beta in the same kernel (e.g. cudnn does this). While the beta are not +// needed for the loss itself (just the grad), we can return log_alpha+log_beta (so same space as currently) and the overhead +// is small and the use-case for loss without grad is relatively limited. +// We parallelize by batch and target sequence. Empirically, it is faster to loop over the input (log probs) sequence and do +// target in parallel, even if it means more frequent __syncthreads. +// In contrast to the cuDNN implementation, we allow large target lengths. For this we need that all previous `s` have been +// computed when we start a new block_s. This is why we have our own for loop here. +template +__global__ void ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + scalar_t* __restrict__ neg_log_likelihood_data, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + + // bookkeeping + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // first row (t=0), the three equations for alpha_1 above eq (6) + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t la; + switch (s) { + case 0: + la = log_probs_data[lp_batch_offset + lp_char_stride * BLANK]; + break; + case 1: + if (target_length > 0) { + la = log_probs_data[lp_batch_offset + lp_char_stride * get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; + } + else { + la = neginf; + } + break; + default: + la = neginf; + } + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + /* la_input_stride * 0 */ + la_target_stride * s] = la; + } + + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + + // These two only depend on s, so we can cache them. + int64_t current_char; // l_s in eq (6) + bool have_three; // flag which of the two cases in eq (6) we have + if (s < 2*target_length+1) { + current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_char)); + } else { + current_char = BLANK; + have_three = false; + } + for (int64_t t=1; t < max_input_length; t++) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) { + // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands, + // lamax is the maximum for the logsumexp trick. 
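The recursion below combines three log-space terms with the standard log-sum-exp trick. As a standalone illustration (plain C++ restatement, not the kernel itself), including the guard the kernel uses when every summand is -infinity:

#include <algorithm>
#include <cmath>
#include <limits>

double logsumexp3(double la1, double la2, double la3) {
  const double neginf = -std::numeric_limits<double>::infinity();
  double lamax = std::max({la1, la2, la3});
  // all-neginf case: pretend the max is 0, exp(-inf - 0) == 0 keeps the result at -inf
  if (lamax == neginf) lamax = 0;
  return std::log(std::exp(la1 - lamax) + std::exp(la2 - lamax) +
                  std::exp(la3 - lamax)) + lamax;
}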
+ scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s]; + scalar_t lamax = la1; + scalar_t la2, la3; + if (s > 0) { + la2 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-1)]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if (have_three) { + la3 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-2)]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // when all are neginf. (then the whole thing is neginf, but we can pretend) + lamax = 0; + + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char]; + } else { + // otherwise we just set to neginf + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf; + } + } + } + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + + // compute the loss (eq (8)) + if (threadIdx.x == 0) { + scalar_t l1 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2)]; + scalar_t l2 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2-1)]; + scalar_t m = ((l1 > l2) ? l1 : l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_data[b] = -log_likelihood; + } +} + +// The forward computation. Lot's of admin and a call to the alpha kernel. +// Note: we do not check that the labels are in the valid range. As we use +// them for indexing in the kernels, you'll see memory errors when you +// pass corrupt labels. +// We support both a 2-dimensional tensor as targets (one set of targets in each row) and +// a 1-dimensional tensor where all targets are concatenated (and we use target_lengths +// to figure out where they begin). +// We return log_alpha (currently, might change to (log_alpha+log_beta) to be passed to the +// backward. The dispatch function will only return the loss. 
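For the concatenated (1-D) targets case described above, the per-batch bookkeeping the forward template performs amounts to computing a start offset and the maximum target length. A plain C++ sketch with illustrative names (the real code stores the offsets in a CPU tensor that is later moved to CUDA):

#include <cstdint>
#include <vector>

struct TargetLayout {
  std::vector<int64_t> batch_offsets;  // where batch b's labels start (tg_batch_offsets)
  int64_t max_target_length;
};

TargetLayout concat_target_layout(const std::vector<int64_t>& target_lengths) {
  TargetLayout layout{std::vector<int64_t>(target_lengths.size()), 0};
  int64_t pos = 0;
  for (size_t b = 0; b < target_lengths.size(); ++b) {
    layout.batch_offsets[b] = pos;
    pos += target_lengths[b];
    if (target_lengths[b] > layout.max_target_length) {
      layout.max_target_length = target_lengths[b];
    }
  }
  // pos now equals sum(target_lengths), which must match targets.size(0);
  // the template verifies this with checkSize().
  return layout;
}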
+template +std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + CheckedFrom c = "ctc_loss_gpu"; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to log_probs cuda if it isn't there already + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkAllSameGPU(c, {log_probs_arg, targets_arg}); + + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... 
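The launch-shape heuristic that follows can be read in isolation: shrink the x-dimension (target positions) to the smallest power of two still covering 2*max_target_length+1, then give the leftover threads to the batch dimension. A host-side sketch with a worked example; the constants mirror the code below, the helper name is made up:

#include <algorithm>
#include <cstdio>

void ctc_block_shape(int max_target_length, int batch_size,
                     int& threads_target, int& threads_batch) {
  constexpr int max_threads = 1024;
  threads_target = max_threads;
  while (threads_target / 2 >= 2 * max_target_length + 1) {
    threads_target /= 2;
  }
  threads_batch = std::min(max_threads / threads_target, batch_size);
}

int main() {
  int tt, tb;
  ctc_block_shape(/*max_target_length=*/30, /*batch_size=*/16, tt, tb);
  // 2*30+1 = 61 -> threads_target = 64, threads_batch = min(1024/64, 16) = 16
  std::printf("threads_target=%d threads_batch=%d\n", tt, tb);
}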
+ constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + ctc_loss_log_alpha_gpu_kernel<<>>( + log_alpha.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// The second (backward) half of the forward backward algorithm, (10) and (11). This is parallel to the +// alpha kernel above. (As mentioned above, it might make sense do the calculation in the alpha kernel.) +template +__global__ void ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t lb; + if (s == 2*target_length) { + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * BLANK]; + } else if ((target_length > 0) && (s == 2*target_length-1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * current_target_prime]; + } else { + lb = neginf; + } + if (s < 2*max_target_length+1) { + log_beta_data[lb_batch_offset + (input_length-1) * lb_input_stride + lb_target_stride * s] = lb; + } + } + + // go backward in s + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + int64_t current_target_prime; + bool have_three; + if (s < 2*target_length+1) { + current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s < 2*target_length-1) && + (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)); + } else { + current_target_prime = BLANK; + have_three = false; + } + // now 
go backward in t. Note that we need to skip the last timestep that we did above. + for (int64_t t=max_input_length-2; t>=0; t--) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch item + if ((t < input_length-1) && (target_length > 0) && (s < 2*target_length+1)) { + scalar_t lb1 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + + if (s < 2*target_length) { + lb2 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+1)]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if (have_three) { + lb3 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+2)]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + scalar_t lb = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; + } else if ((s < 2*max_target_length+1) || (t >= input_length)) { + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; + } + } + } +} + +// This implements the subtrahend of equation (16) for all *nonblank* characters. +// It assumes you have probs in gradient_data when called +// and it modifies gradient_data to be, the gradient. +// In order to facilitate this inplace update, We don't actually do this in logspace. +// (The other variant implemented uses log_space and the differences seem to be +// not so problematic at least with unit normal distributed test activations.) +// Internally this uses atomicAdd because different threads may write to the same +// gradient position. +// This is parallelised over b and s again. +// Note that for us, the Z of eqn (16) is actually constant for all t and it is the +// likelihood - this is why we use the negative log likelihood below. +// We also multiply by the input gradient to keep with standard autograd style. +// I took this trick from [2], for moderate alphabet sizes a log-space +// calculation (with an atomic log add) is similarly in performance, but for large +// alphabets the inplace nature is a considerable advantage. +template +__global__ void ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime! 
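What this kernel accumulates per (batch, label position) pair is the subtrahend of equation (16), evaluated in log space. A plain C++ sketch of that inner loop (illustrative names; the real kernel uses atomicAdd because repeated labels map to the same gradient slot):

#include <cmath>
#include <vector>

void accumulate_nonblank_grad(std::vector<double>& grad_for_target,    // [T] gradient slots for this label
                              const std::vector<double>& log_alpha_s,  // [T] alpha at augmented index 2*s+1
                              const std::vector<double>& log_beta_s,   // [T] beta at augmented index 2*s+1
                              const std::vector<double>& log_p_target, // [T] log prob of this label per step
                              double neg_log_likelihood,
                              double grad_out) {
  for (size_t t = 0; t < grad_for_target.size(); ++t) {
    grad_for_target[t] -= std::exp(log_alpha_s[t] + log_beta_s[t] +
                                   neg_log_likelihood - log_p_target[t]) * grad_out;
  }
}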
+ + if (b >= batch_size) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (s >= target_length) + return; + + int64_t target = targets_data[tg_batch_offset + s * tg_target_stride]; + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t t = 0; t < input_length; t++) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target]; + atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target], + -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)] + + nll - lp) * gr); + } +} + +// This is the naive implementation of equation (16). It is parallelised in batch and input timestep. +// It appears to be faster than the above method for small batch sizes. +template +__global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t t = threadIdx.x + blockIdx.x * blockDim.x; + + if ((t >= max_input_length) || (b >= batch_size)) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + for (int s = 0; s < 2*max_target_length+1; s++) { + if ((target_length > 0) && (s < 2*target_length+1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + scalar_t log_alpha_beta = (log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s]); + scalar_t& lcab = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = ((lcab > log_alpha_beta) ? 
lcab : log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; + if (t < input_length) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + else { + res = 0.; + } + } +} + +// The backward. It essentially computes eq 16 by using the above kernels. +// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward. +template +Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to cuda if it isn't there already + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_beta = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) + + // As above, there may be better configurations to use. 
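The per-entry gradient that the naive collect kernel above finally writes can be restated as a one-liner: the probability minus the normalized alpha*beta mass, scaled by the incoming gradient. A hedged sketch (illustrative function name, plain doubles):

#include <cmath>

double ctc_grad_entry(double log_prob,        // lp = log_probs[t, b, c]
                      double log_alpha_beta,  // collected log sum of alpha*beta for label c at time t
                      double neg_log_likelihood,
                      double grad_out) {
  // matches res = exp(lp) - exp(res + nll - lp) * gr in the kernel (equation (16))
  return std::exp(log_prob) -
         std::exp(log_alpha_beta + neg_log_likelihood - log_prob) * grad_out;
}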
+ constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + { + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_log_beta_gpu_kernel<<>> + (log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + } + + // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on + // the (capped) difference of timings. + // Note that for OK problems target length <= input length, so we + // only consider input length. + bool is_large = (2*log_probs.size(0)+(24*batch_size)/10+(2*num_labels)/10) > 450; + if (is_large) { // large alphabet, large batch + // this computes the probs, minuend in (16) + exp_out(grad, log_probs); + // now we compute the subtrahend for the blanks. It is a straightforward reduction because we know that + // blanks are in every other position. + // maybe we should kernelize this, too. + auto grad_blank = grad.narrow(2, BLANK, 1); + grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1}, + {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2}) + + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1}, + {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}), + 2, true) + .permute({1, 0, 2}) + .add_(neg_log_likelihood.view({1, batch_size, 1})) + .sub_(log_probs.narrow(2, BLANK, 1)) + .exp_() + ); + // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // Again we might configure block and grid in a better way. + int threads_target = max_threads; + while (threads_target / 2 >= max_target_length) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + dim3 block(threads_target, threads_batch); + dim3 grid((max_target_length + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + ctc_loss_backward_collect_nonblank_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } else { // small problem, use naive algorithm + // Still no block/grid configuration guru... 
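The large-versus-small split above is a linear score on the problem dimensions; restated as a standalone predicate so the branch choice is easy to reason about (coefficients copied from the code above, function name made up):

#include <cstdint>

bool ctc_backward_is_large(int64_t time_steps,  // log_probs.size(0)
                           int64_t batch_size,
                           int64_t num_labels) {
  // "large" -> exp_out + logsumexp for blanks + the nonblank collect kernel;
  // otherwise the naive per-(b, t) collect kernel is used.
  return (2 * time_steps + (24 * batch_size) / 10 + (2 * num_labels) / 10) > 450;
}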
+ int threads_input = max_threads; + while (threads_input / 2 >= log_probs.size(0)) { + threads_input /= 2; + } + threads_batch = std::min(max_threads / threads_input, (int) batch_size); + dim3 block(threads_input, threads_batch); + dim3 grid((log_probs.size(0) + threads_input-1)/threads_input, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_collect_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +} } // at::native diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 420733dc558c06..5cde662fba78a6 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -20,17 +20,9 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n) { } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { -#ifndef USE_TH_SIZE_ZERO_DIM - AT_CHECK(n > 0, "n must be greater than 0, got ", n); -#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); -#endif -#ifndef USE_TH_SIZE_ZERO_DIM - if(m <= 0) { -#else if(m < 0) { -#endif m = n; } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index 7fa1fe64f28d6f..f97395d6392ca6 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -80,7 +80,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { return out_tensor; } - auto flip_dims = std::vector(dims); + auto flip_dims = dims.vec(); wrap_all_dims(flip_dims, total_dims); // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work @@ -99,10 +99,10 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast(flip_dims.size())}); - auto shape = 
std::vector(in_tensor.sizes()); + auto shape = in_tensor.sizes().vec(); auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast(shape.size())}); - auto strides = std::vector(in_tensor.strides()); + auto strides = in_tensor.strides().vec(); auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast(strides.size())}); // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp new file mode 100644 index 00000000000000..966aa20e0a128d --- /dev/null +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#if AT_CUDNN_ENABLED() + #include +#endif + + +#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { + throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +} // namespace + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntList input_lengths_, IntList target_lengths_, int64_t BLANK, bool deterministic) { + CheckedFrom c = "cudnn_ctc_loss"; + TensorArg log_probs { log_probs_t, "log_probs", 1 }; + TensorArg targets { targets_t, "targets", 2 }; + checkDim(c, log_probs, 3); + checkScalarType(c, log_probs, kFloat); + checkDim(c, targets, 1); + checkScalarType(c, targets, kInt); + checkContiguous(c, targets); // ? + checkBackend(c, {*log_probs}, Backend::CUDA); + checkBackend(c, {*targets}, Backend::CPU); + int64_t batch_size = log_probs->size(1); + AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); + AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); + + std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); + std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); + + setCuDNNStreamToCurrent(); + AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); + // checked in dispatch: + // assert other conditions for cudnnCTCLoss: all label lengths <= 256 + // all input lengths = logprob.size(0) + + auto handle = getCudnnHandle(); + + cudnnCTCLossAlgo_t algo = (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); + + Tensor probs = log_probs->softmax(2); + TensorDescriptor probs_desc{probs}; + Tensor grad = at::empty_like(probs); + TensorDescriptor grad_desc{grad}; + + CTCLossDescriptor ctc_loss_desc; + ctc_loss_desc.set(CUDNN_DATA_FLOAT); + + size_t workspace_size; + AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(), + targets->data(), target_lengths.data(), input_lengths.data(), + algo, ctc_loss_desc.desc(), &workspace_size)); + + + Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? 
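[Editor's note] Besides the checks visible earlier in this function, the comment above notes two cuDNN-only restrictions that are enforced at dispatch time: every label sequence is at most 256 symbols and every input length equals log_probs.size(0). A plain-C++ sketch of those extra preconditions (hypothetical helper, not the actual dispatcher code):

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

static void check_cudnn_ctc_preconditions(const std::vector<int64_t>& input_lengths,
                                          const std::vector<int64_t>& target_lengths,
                                          int64_t time_steps, int64_t blank) {
  // (both length vectors have already been checked against batch_size above)
  if (blank != 0)
    throw std::runtime_error("cuDNN CTC loss requires blank == 0");
  for (std::size_t i = 0; i < input_lengths.size(); ++i) {
    if (input_lengths[i] != time_steps)
      throw std::runtime_error("cuDNN CTC loss requires input_lengths[i] == log_probs.size(0)");
    if (target_lengths[i] > 256)
      throw std::runtime_error("cuDNN CTC loss requires target lengths <= 256");
  }
}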
+ Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); + + AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), + targets->data(), target_lengths.data(), input_lengths.data(), + costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo, + ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size)); + + return std::make_tuple(costs, grad); +} + + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 63f0d7a29578f9..08e84618e81db3 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -166,7 +166,7 @@ namespace { std::vector descriptors(batch_sizes.size()); size_t i = 0; // To be mutated in the loop - std::vector batch_tensor_size(tensor.sizes()); + auto batch_tensor_size = tensor.sizes().vec(); for (auto batch_size : batch_sizes) { batch_tensor_size[0] = batch_size; // NB: cuDNN RNN API does not support 2d descriptors, so we @@ -994,7 +994,7 @@ std::tuple> _cudnn_rnn_backward( if (output_mask[3]) { dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); } - return std::tuple{dx, dhx, dcx, dw}; + return std::tuple>{dx, dhx, dcx, dw}; } // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8692d6165ff72a..2a8941675d6c9f 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -29,6 +29,11 @@ - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor variants: function, method +- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _cudnn_ctc_loss + - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function dispatch: @@ -244,6 +249,9 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor variants: function +- func: broadcast_tensors(TensorList tensors) -> TensorList + variants: function + - func: cat(TensorList tensors, int64_t dim=0) -> Tensor variants: function @@ -504,6 +512,21 @@ - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor variants: function +- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: ctc_loss_cpu + CUDA: ctc_loss_gpu + +- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor + variants: function + dispatch: + CPU: ctc_loss_backward_cpu + CUDA: ctc_loss_backward_gpu + - func: det(Tensor self) -> Tensor - func: diagflat(Tensor self, int64_t offset=0) -> Tensor @@ -715,9 +738,45 @@ variants: function deprecated: true +# NOTE [ grid_sampler Native Functions ] +# 
`grid_sampler` does all the shape checking and then dispatches to one of +# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which +# has the corresponding backward defined as native functions as well. Therefore, +# in these functions and their backwards, no more shape checking is done. +# +# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to +# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in +# `interpolation_mode` because it only supports Bilinear interpolation mode. +# +# ssnl: Currently `interpolation_mode` is just a placeholder. It is not really +# used. Everywhere Bilinear is assumed. I will add Nearest soon. - func: grid_sampler(Tensor input, Tensor grid, int64_t padding_mode) -> Tensor variants: function +- func: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor + variants: function + dispatch: + CPU: grid_sampler_2d_cpu + CUDA: grid_sampler_2d_cuda + +- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: grid_sampler_2d_backward_cpu + CUDA: grid_sampler_2d_backward_cuda + +- func: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor + variants: function + dispatch: + CPU: grid_sampler_3d_cpu + CUDA: grid_sampler_3d_cuda + +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: grid_sampler_3d_backward_cpu + CUDA: grid_sampler_3d_backward_cuda + - func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor variants: function @@ -1270,6 +1329,12 @@ - func: selu_(Tensor self) -> Tensor variants: function +- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor + variants: function + +- func: celu_(Tensor self, Scalar alpha=1.0) -> Tensor + variants: function + - func: sigmoid(Tensor self) -> Tensor - func: sigmoid_(Tensor self) -> Tensor diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 0cac9bcb9131fa..7a7e8be5c7ff6a 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -63,7 +63,7 @@ SparseTensor new_sparse(const SparseType& dtype) { AT_ASSERT(!dtype.is_variable()); AT_ASSERT(dtype.is_sparse()); // TODO: Hmm... this const_cast business seems a bit dodgy - return SparseTensor(new SparseTensorImpl(const_cast(&dtype)), /* retain */ false); + return SparseTensor(new SparseTensorImpl(dtype.backend(), dtype.scalarType()), /* retain */ false); } /*** Helper methods ***/ diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 226b9084579031..aac948e4940241 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -118,7 +118,7 @@ inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { // That's the assumption this code makes. 
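[Editor's note] Several hunks in this patch, including the one just below, replace std::vector constructed from sizes()/strides() with the .vec() accessor. A minimal sketch of the pattern, assuming only the ATen calls that already appear in this diff (the function name is illustrative):

#include <ATen/ATen.h>

// sizes() returns a non-owning IntList (ArrayRef<int64_t>); .vec() copies it
// into an owned std::vector<int64_t> that can be mutated before being handed
// to a factory such as at::empty.
static at::Tensor values_resized_along_dim0(const at::Tensor& values, int64_t nnz) {
  auto new_size = values.sizes().vec();  // owned, mutable copy of the shape
  new_size[0] = nnz;                     // only the leading dimension changes
  return at::empty(new_size, values.options());
}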
return values.type().tensor({nnz}); } else { - std::vector size = values.sizes(); + std::vector size = values.sizes().vec(); size[0] = nnz; return values.type().tensor(size); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 02b190e4901c55..ff4b0e0c57736c 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -81,7 +81,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { int64_t newNnz = newEnd.first - indicesIter; indices1D.resize_({1, newNnz}); - std::vector newValues_size(values.sizes()); + auto newValues_size = values.sizes().vec(); newValues_size[0] = newNnz; Tensor newValues = at::empty(newValues_size, values.options()); diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml index 86783e4f76dcd6..8a8a8a5dbe954b 100644 --- a/aten/src/ATen/nn.yaml +++ b/aten/src/ATen/nn.yaml @@ -58,7 +58,7 @@ # Activation functions -- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1) +- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) cname: ELU has_inplace: True scalar_check: @@ -274,11 +274,3 @@ - name: thnn_conv_dilated3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1) cname: VolumetricDilatedConvolution buffers: [columns, ones] - -# Vision - -- name: thnn_grid_sampler_bilinear2d(Tensor self, Tensor grid, int64_t padding_mode) - cname: SpatialGridSamplerBilinear - -- name: thnn_grid_sampler_bilinear3d(Tensor self, Tensor grid, int64_t padding_mode) - cname: VolumetricGridSamplerBilinear diff --git a/aten/src/ATen/optional.h b/aten/src/ATen/optional.h index 287ddd8577b340..0a395bae67cda6 100644 --- a/aten/src/ATen/optional.h +++ b/aten/src/ATen/optional.h @@ -1,982 +1 @@ -// Copyright (C) 2011 - 2012 Andrzej Krzemienski. -// -// Use, modification, and distribution is subject to the Boost Software -// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -// -// The idea and interface is based on Boost.Optional library -// authored by Fernando Luis Cacciola Carballal -// -// From https://github.com/akrzemi1/Optional -// -// ATen: -// - Move to `at` namespace. -// - Remove macro use in line 478 because the nvcc device compiler cannot handle it. - -#pragma once - -# include -# include -# include -# include -# include -# include -# include - -# define TR2_OPTIONAL_REQUIRES(...) 
typename std::enable_if<__VA_ARGS__::value, bool>::type = false - -# if defined __GNUC__ // NOTE: GNUC is also defined for Clang -# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) -# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -# elif (__GNUC__ > 4) -# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -# endif -# -# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) -# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -# elif (__GNUC__ > 4) -# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -# endif -# -# if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) -# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) -# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# elif (__GNUC__ > 4) -# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# endif -# endif -# -# if defined __clang_major__ -# if (__clang_major__ == 3 && __clang_minor__ >= 5) -# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -# elif (__clang_major__ > 3) -# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -# endif -# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -# elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) -# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -# endif -# endif -# -# if defined _MSC_VER -# if (_MSC_VER >= 1900) -# define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -# endif -# endif - -# if defined __clang__ -# if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) -# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -# else -# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -# endif -# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -# else -# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -# endif - - -# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 -# define OPTIONAL_CONSTEXPR_INIT_LIST constexpr -# else -# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 -# define OPTIONAL_CONSTEXPR_INIT_LIST -# endif - -# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L) -# define OPTIONAL_HAS_MOVE_ACCESSORS 1 -# else -# define OPTIONAL_HAS_MOVE_ACCESSORS 0 -# endif - -# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr -# if (defined __cplusplus) && (__cplusplus == 201103L) -# define OPTIONAL_MUTABLE_CONSTEXPR -# else -# define OPTIONAL_MUTABLE_CONSTEXPR constexpr -# endif - -namespace at { - -// 20.5.4, optional for object types -template class optional; - -// 20.5.5, optional for lvalue reference types -template class optional; - - -// workaround: std utility functions aren't constexpr yet -template inline constexpr T&& constexpr_forward(typename std::remove_reference::type& t) noexcept -{ - return static_cast(t); -} - -template inline constexpr T&& constexpr_forward(typename std::remove_reference::type&& t) noexcept -{ - static_assert(!std::is_lvalue_reference::value, "!!"); - return static_cast(t); -} - -template inline constexpr typename std::remove_reference::type&& constexpr_move(T&& t) noexcept -{ - return static_cast::type&&>(t); -} - - -#if defined NDEBUG -# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) -#else -# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? 
(EXPR) : ([]{assert(!#CHECK);}(), (EXPR))) -#endif - - -namespace detail_ -{ - -// static_addressof: a constexpr version of addressof -template -struct has_overloaded_addressof -{ - template - constexpr static bool has_overload(...) { return false; } - - template ().operator&()) > - constexpr static bool has_overload(bool) { return true; } - - constexpr static bool value = has_overload(true); -}; - -template )> -constexpr T* static_addressof(T& ref) -{ - return &ref; -} - -template )> -T* static_addressof(T& ref) -{ - return std::addressof(ref); -} - - -// the call to convert(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A -template -constexpr U convert(U v) { return v; } - -} // namespace detail - - -constexpr struct trivial_init_t{} trivial_init{}; - - -// 20.5.6, In-place construction -constexpr struct in_place_t{} in_place{}; - - -// 20.5.7, Disengaged state indicator -struct nullopt_t -{ - struct init{}; - constexpr explicit nullopt_t(init){} -}; -constexpr nullopt_t nullopt{nullopt_t::init()}; - - -// 20.5.8, class bad_optional_access -class bad_optional_access : public std::logic_error { -public: - explicit bad_optional_access(const std::string& what_arg) : logic_error{what_arg} {} - explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} -}; - - -template -union storage_t -{ - unsigned char dummy_; - T value_; - - constexpr storage_t( trivial_init_t ) noexcept : dummy_() {}; - - template - constexpr storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} - - ~storage_t(){} -}; - - -template -union constexpr_storage_t -{ - unsigned char dummy_; - T value_; - - constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {}; - - template - constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} - - ~constexpr_storage_t() = default; -}; - - -template -struct optional_base -{ - bool init_; - storage_t storage_; - - constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {}; - - explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} - - explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} - - template explicit optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template >)> - explicit optional_base(in_place_t, std::initializer_list il, Args&&... args) - : init_(true), storage_(il, std::forward(args)...) {} - - ~optional_base() { if (init_) storage_.value_.T::~T(); } -}; - - -template -struct constexpr_optional_base -{ - bool init_; - constexpr_storage_t storage_; - - constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {}; - - explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {} - - explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} - - template explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template >)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list il, Args&&... args) - : init_(true), storage_(il, std::forward(args)...) 
{} - - ~constexpr_optional_base() = default; -}; - -template -using OptionalBase = typename std::conditional< - std::is_trivially_destructible::value, // if possible - constexpr_optional_base::type>, // use base with trivial destructor - optional_base::type> ->::type; - - - -template -class optional : private OptionalBase -{ - static_assert( !std::is_same::type, nullopt_t>::value, "bad T" ); - static_assert( !std::is_same::type, in_place_t>::value, "bad T" ); - - - constexpr bool initialized() const noexcept { return OptionalBase::init_; } - typename std::remove_const::type* dataptr() { return std::addressof(OptionalBase::storage_.value_); } - constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase::storage_.value_); } - -# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - constexpr const T& contained_val() const& { return OptionalBase::storage_.value_; } -# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } - OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase::storage_.value_; } -# else - T& contained_val() & { return OptionalBase::storage_.value_; } - T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } -# endif -# else - constexpr const T& contained_val() const { return OptionalBase::storage_.value_; } - T& contained_val() { return OptionalBase::storage_.value_; } -# endif - - void clear() noexcept { - if (initialized()) dataptr()->T::~T(); - OptionalBase::init_ = false; - } - - template - void initialize(Args&&... args) noexcept(noexcept(T(std::forward(args)...))) - { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(std::forward(args)...); - OptionalBase::init_ = true; - } - - template - void initialize(std::initializer_list il, Args&&... args) noexcept(noexcept(T(il, std::forward(args)...))) - { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(il, std::forward(args)...); - OptionalBase::init_ = true; - } - -public: - typedef T value_type; - - // 20.5.5.1, constructors - constexpr optional() noexcept : OptionalBase() {}; - constexpr optional(nullopt_t) noexcept : OptionalBase() {}; - - optional(const optional& rhs) - : OptionalBase() - { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(*rhs); - OptionalBase::init_ = true; - } - } - - optional(optional&& rhs) noexcept(std::is_nothrow_move_constructible::value) - : OptionalBase() - { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(std::move(*rhs)); - OptionalBase::init_ = true; - } - } - - constexpr optional(const T& v) : OptionalBase(v) {} - - constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} - - template - explicit constexpr optional(in_place_t, Args&&... args) - : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} - - template >)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list il, Args&&... args) - : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} - - // 20.5.4.2, Destructor - ~optional() = default; - - // 20.5.4.3, assignment - optional& operator=(nullopt_t) noexcept - { - clear(); - return *this; - } - - optional& operator=(const optional& rhs) - { - if (initialized() == true && rhs.initialized() == false) clear(); - else if (initialized() == false && rhs.initialized() == true) initialize(*rhs); - else if (initialized() == true && rhs.initialized() == true) contained_val() = *rhs; - return *this; - } - - optional& operator=(optional&& rhs) - noexcept(std::is_nothrow_move_assignable::value && std::is_nothrow_move_constructible::value) - { - if (initialized() == true && rhs.initialized() == false) clear(); - else if (initialized() == false && rhs.initialized() == true) initialize(std::move(*rhs)); - else if (initialized() == true && rhs.initialized() == true) contained_val() = std::move(*rhs); - return *this; - } - - template - auto operator=(U&& v) - -> typename std::enable_if - < - std::is_same::type, T>::value, - optional& - >::type - { - if (initialized()) { contained_val() = std::forward(v); } - else { initialize(std::forward(v)); } - return *this; - } - - - template - void emplace(Args&&... args) - { - clear(); - initialize(std::forward(args)...); - } - - template - void emplace(std::initializer_list il, Args&&... args) - { - clear(); - initialize(il, std::forward(args)...); - } - - // 20.5.4.4, Swap - void swap(optional& rhs) noexcept(std::is_nothrow_move_constructible::value && noexcept(swap(std::declval(), std::declval()))) - { - if (initialized() == true && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); } - else if (initialized() == false && rhs.initialized() == true) { initialize(std::move(*rhs)); rhs.clear(); } - else if (initialized() == true && rhs.initialized() == true) { using std::swap; swap(**this, *rhs); } - } - - // 20.5.4.5, Observers - - explicit constexpr operator bool() const noexcept { return initialized(); } - constexpr bool has_value() const noexcept { return initialized(); } - - constexpr T const* operator ->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); - } - -# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() { - assert (initialized()); - return dataptr(); - } - - constexpr T const& operator *() const& { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & { - assert (initialized()); - return contained_val(); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && { - assert (initialized()); - return constexpr_move(contained_val()); - } - - constexpr T const& value() const& { - return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& value() & { - return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { - if (!initialized()) throw bad_optional_access("bad optional access"); - return std::move(contained_val()); - } - -# else - - T* operator ->() { - assert (initialized()); - return dataptr(); - } - - constexpr T const& operator *() const { - return contained_val(); - } - - T& operator *() { - assert (initialized()); - return contained_val(); - } - - constexpr T const& value() const { - return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - - T& value() { - return initialized() ? 
contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - -# endif - -# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - - template - constexpr T value_or(V&& v) const& - { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - template - OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && - { - return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); - } - -# else - - template - T value_or(V&& v) && - { - return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); - } - -# endif - -# else - - template - constexpr T value_or(V&& v) const - { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -# endif - - // 20.6.3.6, modifiers - void reset() noexcept { clear(); } -}; - - -template -class optional -{ - static_assert( !std::is_same::value, "bad T" ); - static_assert( !std::is_same::value, "bad T" ); - T* ref; - -public: - - // 20.5.5.1, construction/destruction - constexpr optional() noexcept : ref(nullptr) {} - - constexpr optional(nullopt_t) noexcept : ref(nullptr) {} - - constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} - - optional(T&&) = delete; - - constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} - - explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {} - - explicit optional(in_place_t, T&&) = delete; - - ~optional() = default; - - // 20.5.5.2, mutation - optional& operator=(nullopt_t) noexcept { - ref = nullptr; - return *this; - } - - // optional& operator=(const optional& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - // optional& operator=(optional&& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - template - auto operator=(U&& rhs) noexcept - -> typename std::enable_if - < - std::is_same::type, optional>::value, - optional& - >::type - { - ref = rhs.ref; - return *this; - } - - template - auto operator=(U&& rhs) noexcept - -> typename std::enable_if - < - !std::is_same::type, optional>::value, - optional& - >::type - = delete; - - void emplace(T& v) noexcept { - ref = detail_::static_addressof(v); - } - - void emplace(T&&) = delete; - - - void swap(optional& rhs) noexcept - { - std::swap(ref, rhs.ref); - } - - // 20.5.5.3, observers - constexpr T* operator->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); - } - - constexpr T& operator*() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); - } - - constexpr T& value() const { - return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref); - } - - explicit constexpr operator bool() const noexcept { - return ref != nullptr; - } - - constexpr bool has_value() const noexcept { - return ref != nullptr; - } - - template - constexpr typename std::decay::type value_or(V&& v) const - { - return *this ? **this : detail_::convert::type>(constexpr_forward(v)); - } - - // x.x.x.x, modifiers - void reset() noexcept { ref = nullptr; } -}; - - -template -class optional -{ - static_assert( sizeof(T) == 0, "optional rvalue references disallowed" ); -}; - - -// 20.5.8, Relational operators -template constexpr bool operator==(const optional& x, const optional& y) -{ - return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; -} - -template constexpr bool operator!=(const optional& x, const optional& y) -{ - return !(x == y); -} - -template constexpr bool operator<(const optional& x, const optional& y) -{ - return (!y) ? false : (!x) ? true : *x < *y; -} - -template constexpr bool operator>(const optional& x, const optional& y) -{ - return (y < x); -} - -template constexpr bool operator<=(const optional& x, const optional& y) -{ - return !(y < x); -} - -template constexpr bool operator>=(const optional& x, const optional& y) -{ - return !(x < y); -} - - -// 20.5.9, Comparison with nullopt -template constexpr bool operator==(const optional& x, nullopt_t) noexcept -{ - return (!x); -} - -template constexpr bool operator==(nullopt_t, const optional& x) noexcept -{ - return (!x); -} - -template constexpr bool operator!=(const optional& x, nullopt_t) noexcept -{ - return bool(x); -} - -template constexpr bool operator!=(nullopt_t, const optional& x) noexcept -{ - return bool(x); -} - -template constexpr bool operator<(const optional&, nullopt_t) noexcept -{ - return false; -} - -template constexpr bool operator<(nullopt_t, const optional& x) noexcept -{ - return bool(x); -} - -template constexpr bool operator<=(const optional& x, nullopt_t) noexcept -{ - return (!x); -} - -template constexpr bool operator<=(nullopt_t, const optional&) noexcept -{ - return true; -} - -template constexpr bool operator>(const optional& x, nullopt_t) noexcept -{ - return bool(x); -} - -template constexpr bool operator>(nullopt_t, const optional&) noexcept -{ - return false; -} - -template constexpr bool operator>=(const optional&, nullopt_t) noexcept -{ - return true; -} - -template constexpr bool operator>=(nullopt_t, const optional& x) noexcept -{ - return (!x); -} - - - -// 20.5.10, Comparison with T -template constexpr bool operator==(const optional& x, const T& v) -{ - return bool(x) ? *x == v : false; -} - -template constexpr bool operator==(const T& v, const optional& x) -{ - return bool(x) ? v == *x : false; -} - -template constexpr bool operator!=(const optional& x, const T& v) -{ - return bool(x) ? *x != v : true; -} - -template constexpr bool operator!=(const T& v, const optional& x) -{ - return bool(x) ? v != *x : true; -} - -template constexpr bool operator<(const optional& x, const T& v) -{ - return bool(x) ? *x < v : true; -} - -template constexpr bool operator>(const T& v, const optional& x) -{ - return bool(x) ? v > *x : true; -} - -template constexpr bool operator>(const optional& x, const T& v) -{ - return bool(x) ? *x > v : false; -} - -template constexpr bool operator<(const T& v, const optional& x) -{ - return bool(x) ? v < *x : false; -} - -template constexpr bool operator>=(const optional& x, const T& v) -{ - return bool(x) ? *x >= v : false; -} - -template constexpr bool operator<=(const T& v, const optional& x) -{ - return bool(x) ? v <= *x : false; -} - -template constexpr bool operator<=(const optional& x, const T& v) -{ - return bool(x) ? *x <= v : true; -} - -template constexpr bool operator>=(const T& v, const optional& x) -{ - return bool(x) ? v >= *x : true; -} - - -// Comparison of optional with T -template constexpr bool operator==(const optional& x, const T& v) -{ - return bool(x) ? *x == v : false; -} - -template constexpr bool operator==(const T& v, const optional& x) -{ - return bool(x) ? v == *x : false; -} - -template constexpr bool operator!=(const optional& x, const T& v) -{ - return bool(x) ? 
*x != v : true; -} - -template constexpr bool operator!=(const T& v, const optional& x) -{ - return bool(x) ? v != *x : true; -} - -template constexpr bool operator<(const optional& x, const T& v) -{ - return bool(x) ? *x < v : true; -} - -template constexpr bool operator>(const T& v, const optional& x) -{ - return bool(x) ? v > *x : true; -} - -template constexpr bool operator>(const optional& x, const T& v) -{ - return bool(x) ? *x > v : false; -} - -template constexpr bool operator<(const T& v, const optional& x) -{ - return bool(x) ? v < *x : false; -} - -template constexpr bool operator>=(const optional& x, const T& v) -{ - return bool(x) ? *x >= v : false; -} - -template constexpr bool operator<=(const T& v, const optional& x) -{ - return bool(x) ? v <= *x : false; -} - -template constexpr bool operator<=(const optional& x, const T& v) -{ - return bool(x) ? *x <= v : true; -} - -template constexpr bool operator>=(const T& v, const optional& x) -{ - return bool(x) ? v >= *x : true; -} - -// Comparison of optional with T -template constexpr bool operator==(const optional& x, const T& v) -{ - return bool(x) ? *x == v : false; -} - -template constexpr bool operator==(const T& v, const optional& x) -{ - return bool(x) ? v == *x : false; -} - -template constexpr bool operator!=(const optional& x, const T& v) -{ - return bool(x) ? *x != v : true; -} - -template constexpr bool operator!=(const T& v, const optional& x) -{ - return bool(x) ? v != *x : true; -} - -template constexpr bool operator<(const optional& x, const T& v) -{ - return bool(x) ? *x < v : true; -} - -template constexpr bool operator>(const T& v, const optional& x) -{ - return bool(x) ? v > *x : true; -} - -template constexpr bool operator>(const optional& x, const T& v) -{ - return bool(x) ? *x > v : false; -} - -template constexpr bool operator<(const T& v, const optional& x) -{ - return bool(x) ? v < *x : false; -} - -template constexpr bool operator>=(const optional& x, const T& v) -{ - return bool(x) ? *x >= v : false; -} - -template constexpr bool operator<=(const T& v, const optional& x) -{ - return bool(x) ? v <= *x : false; -} - -template constexpr bool operator<=(const optional& x, const T& v) -{ - return bool(x) ? *x <= v : true; -} - -template constexpr bool operator>=(const T& v, const optional& x) -{ - return bool(x) ? v >= *x : true; -} - - -// 20.5.12, Specialized algorithms -template -void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) -{ - x.swap(y); -} - - -template -constexpr optional::type> make_optional(T&& v) -{ - return optional::type>(constexpr_forward(v)); -} - -template -constexpr optional make_optional(std::reference_wrapper v) -{ - return optional(v.get()); -} - - -} // namespace at - -namespace std -{ - template - struct hash> - { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? std::hash{}(*arg) : result_type{}; - } - }; - - template - struct hash> - { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? 
std::hash{}(*arg) : result_type{}; - } - }; -} - -# undef TR2_OPTIONAL_REQUIRES -# undef TR2_OPTIONAL_ASSERTED_EXPRESSION +#include diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp deleted file mode 100644 index 0491203c3286e6..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "ATen/${Storage}.h" - -// ${generated_comment} - -#include "ATen/Half.h" -#include "ATen/Allocator.h" -#include - -#include "ATen/Config.h" -$extra_cuda_headers - -namespace at { - -${Storage}::${Storage}() - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - 0, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size, Allocator* allocator) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - allocator, - /* resizable */ false)) {} - -// TODO: Take in Device as an input to the std::function constructor - -#if ${isCUDA} -static int getPointerDevice(void* ptr) { - struct cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); - return attr.device; -} -#endif - -${Storage}::${Storage}( - void * data, - size_t size, - const std::function & deleter) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - InefficientStdFunctionContext::makeDataPtr(data, deleter, -#if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) -#else - kCPU -#endif - ), - /* allocator */ nullptr, - /* resizable */ false)) {} -} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h deleted file mode 100644 index dddcd5dbf03f21..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -// ${generated_comment} - -$th_headers - -#include "ATen/Storage.h" -#include "ATen/Context.h" - -#include - -namespace at { - -struct Allocator; - -struct ${Storage} final : public Storage { - ${Storage}(); - ${Storage}(StorageImpl* storage_impl) : Storage(storage_impl){}; - ${Storage}(size_t size); - ${Storage}(size_t size, Allocator* allocator); - ${Storage}( - void* data, - size_t size, - const std::function& deleter); - StorageImpl* storage_impl_; - - protected: - friend struct ${Type}; -}; - -} // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 31e952ebb79ff8..55fb4aec0cbb60 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,7 +2,6 @@ // ${generated_comment} -#include "ATen/Generator.h" #include "ATen/Scalar.h" #include "ATen/ScalarType.h" #include "ATen/SparseTensorRef.h" @@ -10,12 +9,12 @@ #include "ATen/TensorAccessor.h" #include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" -#include "ATen/Utils.h" #include "ATen/Device.h" #include "ATen/Layout.h" #include "ATen/optional.h" namespace at { +struct Generator; struct Type; struct Tensor; struct TensorOptions; diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index cc2f47a89180ab..aeba9fb22a3653 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -3,5 +3,5 @@ std::unique_ptr ${Tensor}::storage() { auto storage = 
THTensor_getStoragePtr(tensor); THStorage_retain(storage); - return std::unique_ptr(new ${Storage}(storage)); + return std::unique_ptr(new Storage(storage)); } diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index d72ba4abde2c12..5fab8bf2226417 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -5,9 +5,8 @@ // ${generated_comment} -#include "ATen/Config.h" #include "ATen/${Tensor}.h" -#include "ATen/${Storage}.h" +#include "ATen/Storage.h" #include "ATen/Scalar.h" #include "ATen/Half.h" @@ -22,7 +21,7 @@ namespace detail { } ${Tensor}::${Tensor}(${THTensor} * tensor) -: TensorImpl(&globalContext().getType(Backend::${Backend},ScalarType::${ScalarName}), tensor) +: TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, tensor, /* is variable */ false) {} ${TensorDenseOrSparse} diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 67009473dddefc..ddd1483f0436f3 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -31,6 +31,14 @@ namespace at { +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + ${Type}::${Type}(Context* context) : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { @@ -44,18 +52,44 @@ bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage() const { - return std::unique_ptr(new ${Storage}()); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + 0, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storage(size_t size) const { - return std::unique_ptr(new ${Storage}(size)); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + size, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return std::unique_ptr( - new ${Storage}(data,size,deleter)); + new Storage( + ScalarType::${ScalarName}, + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), + size, + deleter)); } std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { return std::unique_ptr( - new ${Storage}(size, allocator)); + new Storage(ScalarType::${ScalarName}, size, allocator)); } Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) @@ -65,7 +99,7 @@ Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain) ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); - return std::unique_ptr(new ${Storage}((${THStorage}*) th_pointer)); + return std::unique_ptr(new Storage((${THStorage}*) th_pointer)); } std::unique_ptr ${Type}::generator() const { return std::unique_ptr(new ${Generator}(context)); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 6b46c8c0b70018..8e58df97073086 100644 --- a/aten/src/ATen/test/basic.cpp +++ 
b/aten/src/ATen/test/basic.cpp @@ -270,6 +270,10 @@ static void test(Type & type) { auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } + SECTION("core") { + int i = CoreTest(); + REQUIRE(i + 1 == CoreTest()); + } } TEST_CASE( "basic tests CPU", "[cpu]" ) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 64098c5bf76c56..4a400e3a517ee6 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -65,30 +65,13 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze -#ifndef USE_TH_SIZE_ZERO_DIM - if (t.numel() != 0) { - REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); - } else { - REQUIRE_THROWS(t.unsqueeze(0)); - } -#else REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); -#endif // unsqueeze_ { auto t2 = ones(*s, T); -#ifndef USE_TH_SIZE_ZERO_DIM - if (t2.numel() != 0) { - auto r = t2.unsqueeze_(0); - REQUIRE(r.dim() == t.dim() + 1); - } else { - REQUIRE_THROWS(t2.unsqueeze_(0)); - } -#else auto r = t2.unsqueeze_(0); REQUIRE(r.dim() == t.dim() + 1); -#endif } // squeeze (with dimension argument) diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp index 1c46c59a9977fa..840c97617c4cb2 100644 --- a/aten/src/TH/THHalf.cpp +++ b/aten/src/TH/THHalf.cpp @@ -1,4 +1,5 @@ #include "THHalf.h" +#include /* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ @@ -16,85 +17,14 @@ TH_API float TH_half2float(THHalf h) return f; } -// Host functions for converting between FP32 and FP16 formats void TH_halfbits2float(unsigned short* src, float* res) { - unsigned h = *src; - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71; - do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70; - } - - *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); + *res = at::detail::halfbits2float(*src); } + void TH_float2halfbits(float* src, unsigned short* dest) { - unsigned x = *(unsigned*)src; - unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; - - // Get rid of +NaN/-NaN case first. - if (u > 0x7f800000) { - *dest = 0x7fffU; - return ; - } - - sign = ((x >> 16) & 0x8000); - - // Get rid of +Inf/-Inf, +0/-0. - if (u > 0x477fefff) { - *dest = sign | 0x7c00U; - return; - } - if (u < 0x33000001) { - *dest = (sign | 0x0000); - return; - } - - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); - - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; - } else { - shift = 0x7e - exponent; - exponent = 0; - mantissa |= 0x800000; - } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); - - // Round to nearest even. 
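[Editor's note] The deleted lines just below implement the ties-to-even rounding named by that comment; the conversion now delegates to at::detail::float2halfbits. A self-contained sketch of the same rounding step, omitting the exponent carry that the full routine also performs when the mantissa overflows:

#include <cstdint>

// Drop `shift` low bits of the mantissa, rounding to nearest; on an exact tie
// (remainder == half of the last kept bit) round up only if the kept low bit
// is 1, so results land on even mantissa values.
static uint32_t round_mantissa_nearest_even(uint32_t mantissa, uint32_t shift) {
  uint32_t lsb = 1u << shift;            // value of the last kept bit
  uint32_t lsb_s1 = lsb >> 1;            // exactly half of that
  uint32_t remainder = mantissa & (lsb - 1);
  mantissa >>= shift;
  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 1u))) {
    ++mantissa;                          // round up; ties go to the even value
  }
  return mantissa;
}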
- remainder = (mantissa & lsb_m1); - mantissa >>= shift; - if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { - ++mantissa; - if (!(mantissa & 0x3ff)) { - ++exponent; - mantissa = 0; - } - } - - *dest = (sign | (exponent << 10) | mantissa); + *dest = at::detail::float2halfbits(*src); } diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index 0f05bb466651d3..0c36d5bf97fcf0 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -19,38 +19,25 @@ void THStorage_free(THStorage* storage) { if (!storage) { return; } - - if (--storage->refcount == 0) { - if (storage->finalizer) { - (*storage->finalizer)(); - } - storage->finalizer = nullptr; - storage->data_ptr.clear(); - THStorage_weakFree(storage); - } + storage->release(); } // Manually retains a weak reference void THStorage_weakRetain(THStorage *weak_storage) { - weak_storage->weakcount++; + weak_storage->weak_retain(); } // Releases a weak reference void THStorage_weakFree(THStorage *weak_storage) { - if (--weak_storage->weakcount == 0) { - delete weak_storage; - } + weak_storage->weak_release(); } // Given a weak reference, returns a strong reference to a storage (which must // be freed when done) or null if the storage is already dead. THStorage* THStorage_weakLock(THStorage *weak_storage) { - for (;;) { - int refcount = weak_storage->refcount.load(); - if (refcount == 0) return nullptr; - if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; - } - return weak_storage; + if (weak_storage->weak_lock()) + return weak_storage; + return nullptr; } THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { @@ -95,7 +82,7 @@ ptrdiff_t THStorage_size(const THStorage *self) void THStorage_retain(THStorage *storage) { if (storage) { - ++storage->refcount; + storage->retain(); } } diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 671e2f39fb1c7e..0e8b3e4ab17bee 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -35,8 +35,6 @@ TH_API ptrdiff_t THStorage_size(const THStorage *self); -TH_API void THStorage_setFlag(THStorage *storage, const char flag); -TH_API void THStorage_clearFlag(THStorage *storage, const char flag); TH_API void THStorage_retain(THStorage *storage); TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 13df5128e5f5f8..5f3b6ed1fef6cc 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -32,7 +32,7 @@ THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList // This could perhaps be combined with the below code, but the complexity didn't seem worth it. 
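[Editor's note] Looking back at the THStorageFunctions.cpp hunk above: THStorage_weakLock's hand-rolled compare-exchange loop is now hidden behind weak_lock(). A minimal std::atomic sketch of the promotion that call has to implement (illustrative only, not the actual member function):

#include <atomic>

// Promote a weak reference to a strong one only if the strong refcount is
// still nonzero; compare-exchange ensures the bump cannot race with the
// final release.
static bool try_lock_weak(std::atomic<int>& refcount) {
  int current = refcount.load();
  while (current != 0) {
    if (refcount.compare_exchange_weak(current, current + 1)) {
      return true;   // promoted: caller now owns a strong reference
    }
    // on failure, `current` was reloaded; retry unless it dropped to zero
  }
  return false;      // storage already dead; the weak reference cannot be promoted
}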
int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); if (numel == 0 && oldshape.equals(newshape)) { - return std::vector(oldstride); + return oldstride.vec(); } std::vector newstride(newshape.size()); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 16329f7ed7f621..56204a00e9c3ed 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -56,6 +56,10 @@ struct THTensor return sizes_.size(); } + at::ScalarType scalar_type() const { + return storage_->scalar_type; + } + ptrdiff_t storage_offset() const { return storage_offset_; } @@ -109,6 +113,17 @@ inline int64_t* THTensor_getStridePtr(THTensor* tensor) { // NB: Non-retaining inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { + // Within PyTorch, the invariant is that storage_ is always + // initialized; we never have tensors that don't have any storage. + // However, for Caffe2, this is not true, because they have permitted + // tensors to be allocated without specifying what scalar type + // they should be, only to be filled when GetMutableData is called + // for the first time (providing the necessary type). It is an ERROR to + // invoke any PyTorch operations on such a half-constructed storage, + // and this check tests for that case. + AT_CHECK(tensor->storage_, "Cannot use PyTorch operations on a half-constructed " + "tensor. If this tensor came from Caffe2, please call GetMutableData on " + "it first; otherwise, this is a bug, please report it."); return tensor->storage_; } @@ -118,6 +133,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { inline void THTensor_resizeDim(THTensor* tensor, int64_t ndim) { // NB: This is *truly* a resize; calling code (e.g., squeeze) // assumes that old values are preserved + tensor->is_zero_dim_ = bool(ndim == 0); tensor->sizes_.resize(ndim); tensor->strides_.resize(ndim); } @@ -141,6 +157,9 @@ inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset // NB: Steals ownership of storage inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + AT_ASSERT(storage); tensor->storage_ = storage; } @@ -177,6 +196,19 @@ inline int THTensor_nDimensionLegacyAll(const THTensor* tensor) { } } +inline int64_t THTensor_strideLegacyNoScalars(const THTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 1 : self->stride(dim); +} + +inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 
1 : self->size(dim); +} + TH_API void THTensor_free(THTensor *self); TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index 00c24dee51adb8..ff05ed8194979d 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ -39,8 +39,8 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ + if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyNoScalars(TENSOR1)); \ int same_dims = 1; \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ same_dims = 0; \ @@ -56,8 +56,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -76,14 +76,14 @@ { \ CODE \ \ - if(TENSOR1->dim() == 1) \ + if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -98,7 +98,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -145,13 +145,13 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyAll(TENSOR1)); \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \ } \ TH_UNUSED int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ @@ -163,8 +163,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = 
(int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -179,14 +179,14 @@ { \ CODE \ \ - if(TENSOR1->dim() == 1) \ + if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ break; \ \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -200,7 +200,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 58a5d39366c294..e68c60a9455c4f 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -373,11 +373,7 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck( firstIndex >= 0, 3, "out of range"); -#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 4, "out of range"); -#else - THArgCheck( size > 0, 4, "out of range"); -#endif THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range"); THTensor_(set)(self, src); @@ -396,12 +392,8 @@ void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sli if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a vector"); -#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); -#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range"); @@ -423,8 +415,8 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); THTensor_(set)(self, src); @@ -446,10 +438,7 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); -#endif - THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck((dimension >= 0) && (dimension < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); @@ -459,18 +448,20 
@@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz std::vector newStride(/* size */ self->dim()+1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride(dimension); + newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); for(d = 0; d < self->dim(); d++) { + auto self_size = THTensor_sizeLegacyNoScalars(self, d); + auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self->size(d) - size) / step + 1; - newStride[d] = step*self->stride(d); + newSize[d] = (self_size - size) / step + 1; + newStride[d] = step*self_stride; } else { - newSize[d] = self->size(d); - newStride[d] = self->stride(d); + newSize[d] = self_size; + newStride[d] = self_stride; } } @@ -547,9 +538,6 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range"); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 2, "cannot unsqueeze empty tensor"); -#endif THTensor_(set)(self, src); @@ -728,15 +716,6 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t for(d = 0; d < nDimension; d++) { -#ifndef USE_TH_SIZE_ZERO_DIM - // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this - // currently exist and expect a size [0] tensor to be returned. - if (d == 0 && size[d] == 0) { - nDimension = 1; - } else { - AT_CHECK(size[d] > 0, "sizes must be non-negative"); - } -#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -790,14 +769,14 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { - THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { - THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 644fa541a8f9ae..03946724dcadc6 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -149,15 +149,8 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens int64_t *index_data; real *tensor_data, *src_data; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(index) <= 1, 3, "Index is supposed to be an empty tensor or a vector"); - THArgCheck(dim < THTensor_nDimensionLegacyAll(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(THTensor_nDimensionLegacyAll(src) > 0, 2, "Source tensor is empty"); -#else - THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); - THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - 
//THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); -#endif + THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); numel = THLongTensor_nElement(index); @@ -188,7 +181,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->dim() == 1) { + if (src->dim() <= 1) { #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; idim() == 1) + else if (src->dim() <= 1) { for (i=0; idim() == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); -#endif - THArgCheck(numel == src->size(dim),4,"Number of indices should be equal to source:size(dim)"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(src), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + THArgCheck(numel == THTensor_sizeLegacyNoScalars(src, dim),4,"Number of indices should be equal to source:size(dim)"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -400,13 +388,8 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v int64_t *index_data; numel = THLongTensor_nElement(index); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(index) == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < THTensor_nDimensionLegacyAll(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); -#else - THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); -#endif + THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -459,19 +442,11 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor { int64_t elems_per_row, i, idx; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(dim < THTensor_(nDimensionLegacyAll)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor_nDimensionLegacyAll(index) == THTensor_(nDimensionLegacyAll)(tensor), 3, - "Index tensor must have same dimensions as output tensor"); - THArgCheck(THTensor_(nDimensionLegacyAll)(src) == THTensor_(nDimensionLegacyAll)(tensor), 4, - "Input tensor must have same dimensions as output tensor"); -#else THArgCheck(dim < THTensor_(nDimensionLegacyNoScalars)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimensionLegacyNoScalars(index) == THTensor_(nDimensionLegacyNoScalars)(tensor), 3, "Index tensor must have same dimensions as output tensor"); THArgCheck(THTensor_(nDimensionLegacyNoScalars)(src) == THTensor_(nDimensionLegacyNoScalars)(tensor), 4, "Input tensor must have same dimensions as output tensor"); -#endif elems_per_row = THLongTensor_size(index, dim); diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index c521d1da750a43..24d9a7e8c4ea07 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ 
-805,11 +805,11 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, T void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) { - if( (mat->dim() != 2) || (vec->dim() != 1) ) + if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) THError("matrix and vector expected, got %dD, %dD", - mat->dim(), vec->dim()); + mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); - if( mat->size(1) != vec->size(0) ) { + if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) { THDescBuff bm = THTensor_(sizeDesc)(mat); THDescBuff bv = THTensor_(sizeDesc)(vec); THError("size mismatch, %s, %s", bm.str, bv.str); @@ -837,14 +837,14 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor { THBlas_(gemv)('n', mat->size(0), mat->size(1), alpha, THTensor_(data)(mat), mat->stride(1), - THTensor_(data)(vec), vec->stride(0), + THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), beta, THTensor_(data)(r_), r_->stride(0)); } else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0))) { THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(mat), mat->stride(0), - THTensor_(data)(vec), vec->stride(0), + THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), beta, THTensor_(data)(r_), r_->stride(0)); } else @@ -853,7 +853,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(cmat), cmat->stride(0), - THTensor_(data)(vec), vec->stride(0), + THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), beta, THTensor_(data)(r_), r_->stride(0)); THTensor_(free)(cmat); @@ -861,7 +861,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // In gemv (x,0).mv(0) does not // handle beta, whereas gemm does for case where (x,0).mm(0,y). 
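For readers tracking the LegacyNoScalars migration: the helpers added in THTensor.hpp above make a zero-dimensional tensor behave, at legacy call sites such as this addmv path, like a one-element vector with stride 1. The following is a minimal standalone sketch of that convention, not code from the patch; FakeTensor is a hypothetical stand-in for THTensor and the THArgCheck bounds checks are omitted.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for THTensor, only to illustrate the convention:
// an empty sizes vector models a 0-dim (scalar) tensor.
struct FakeTensor {
  std::vector<int64_t> sizes;
  std::vector<int64_t> strides;
};

// Sketch of THTensor_sizeLegacyNoScalars: a scalar reports size 1 at dim 0.
int64_t sizeLegacyNoScalars(const FakeTensor& t, int dim) {
  return t.sizes.empty() ? 1 : t.sizes[dim];
}

// Sketch of THTensor_strideLegacyNoScalars: a scalar reports stride 1 at dim 0.
int64_t strideLegacyNoScalars(const FakeTensor& t, int dim) {
  return t.sizes.empty() ? 1 : t.strides[dim];
}

int main() {
  FakeTensor scalar{{}, {}};   // 0-dim tensor
  FakeTensor vec{{5}, {1}};    // ordinary 1-dim tensor
  assert(sizeLegacyNoScalars(scalar, 0) == 1);
  assert(strideLegacyNoScalars(scalar, 0) == 1);
  assert(sizeLegacyNoScalars(vec, 0) == 5);
  assert(strideLegacyNoScalars(vec, 0) == 1);
  return 0;
}

This is why the surrounding addmv and addr hunks can pass THTensor_sizeLegacyNoScalars(vec, 0) and THTensor_strideLegacyNoScalars(vec, 0) straight into the BLAS calls without special-casing scalars.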
- if (vec->size(0) == 0 && mat->size(0) != 0) { + if (THTensor_sizeLegacyNoScalars(vec, 0) == 0 && mat->size(0) != 0) { if (beta == 0) { THTensor_(zero)(r_); } else if (beta != 1) { @@ -1058,14 +1058,19 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) { - if( (vec1->dim() != 1) || (vec2->dim() != 1) ) + if( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) THError("vector and vector expected, got %dD, %dD tensors", - vec1->dim(), vec2->dim()); + THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); if(t->dim() != 2) THError("expected matrix, got %dD tensor for t", t->dim()); - if( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { + auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); + auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); + auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); + auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); + + if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bv1 = THTensor_(sizeDesc)(vec1); THDescBuff bv2 = THTensor_(sizeDesc)(vec2); @@ -1087,27 +1092,27 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(r_->stride(0) == 1 && LDA_COND(vec1->size(0), vec2->size(0), r_->stride(1))) + if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1))) { - THBlas_(ger)(vec1->size(0), vec2->size(0), - alpha, THTensor_(data)(vec1), vec1->stride(0), - THTensor_(data)(vec2), vec2->stride(0), + THBlas_(ger)(vec1_size, vec2_size, + alpha, THTensor_(data)(vec1), vec1_stride, + THTensor_(data)(vec2), vec2_stride, THTensor_(data)(r_), r_->stride(1)); } - else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1->size(0), r_->stride(0))) + else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1_size, r_->stride(0))) { - THBlas_(ger)(vec2->size(0), vec1->size(0), - alpha, THTensor_(data)(vec2), vec2->stride(0), - THTensor_(data)(vec1), vec1->stride(0), + THBlas_(ger)(vec2_size, vec1_size, + alpha, THTensor_(data)(vec2), vec2_stride, + THTensor_(data)(vec1), vec1_stride, THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cr = THTensor_(newClone)(r_); - THBlas_(ger)(vec2->size(0), vec1->size(0), - alpha, THTensor_(data)(vec2), vec2->stride(0), - THTensor_(data)(vec1), vec1->stride(0), + THBlas_(ger)(vec2_size, vec1_size, + alpha, THTensor_(data)(vec2), vec2_stride, + THTensor_(data)(vec1), vec1_stride, THTensor_(data)(cr), cr->stride(0)); THTensor_(freeCopyTo)(cr, r_); diff --git a/aten/src/TH/generic/THTensorMoreMath.cpp b/aten/src/TH/generic/THTensorMoreMath.cpp index d06ec255644cce..fa8fb0558661ea 100644 --- a/aten/src/TH/generic/THTensorMoreMath.cpp +++ b/aten/src/TH/generic/THTensorMoreMath.cpp @@ -557,9 +557,6 @@ void THTensor_(onesLike)(THTensor *r_, THTensor *input) void THTensor_(diag)(THTensor *r_, THTensor *t, int k) { -#ifndef USE_TH_SIZE_ZERO_DIM - AT_ASSERT(!t->is_empty()) -#endif THArgCheck(THTensor_(nDimensionLegacyNoScalars)(t) == 1 || THTensor_(nDimensionLegacyNoScalars)(t) == 2, 1, "matrix or a vector expected"); if(THTensor_(nDimensionLegacyNoScalars)(t) == 1) @@ -1186,19 +1183,11 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i void 
THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { -#ifndef USE_TH_SIZE_ZERO_DIM - int numDims = THTensor_(nDimensionLegacyAll)(t); -#else int numDims = THTensor_(nDimensionLegacyNoScalars)(t); -#endif THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); int64_t sliceSize = THTensor_(size)(t, dim); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); -#else THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); -#endif THTensor *tmpResults = THTensor_(new)(); THTensor_(resize1d)(tmpResults, sliceSize); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 9df36f097ba6ee..a8fb33c11a5bd4 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -10,7 +10,7 @@ #include "THCTensorInfo.cuh" int THCTensor_nDimensionLegacyNoScalars(THCState *state, const THCTensor *self) { - return self->dim(); + return THTensor_nDimensionLegacyNoScalars(self); } int THCTensor_nDimensionLegacyAll(THCState *state, const THCTensor *self) { @@ -99,15 +99,6 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ for(d = 0; d < nDimension; d++) { -#ifndef USE_TH_SIZE_ZERO_DIM - // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this - // currently exist and expect a size [0] tensor to be returned. - if (d == 0 && size[d] == 0) { - nDimension = 1; - } else { - AT_CHECK(size[d] > 0, "sizes must be non-negative"); - } -#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -234,9 +225,6 @@ void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 3, "dimension out of range"); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 3, "cannot unsqueeze empty tensor"); -#endif THCTensor_set(state, self, src); diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index e15ba5e5a2c666..940af6eb86ead4 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -28,11 +28,21 @@ int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) return THCTensor_size(state, self, dim); } +int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) +{ + return THTensor_sizeLegacyNoScalars(self, dim); +} + int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) { return THCTensor_stride(state, self, dim); } +int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) +{ + return THTensor_strideLegacyNoScalars(self, dim); +} + THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) { return THCTensor_newSizeOf(state, self); @@ -367,11 +377,7 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di THArgCheck( (dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck( firstIndex >= 0, 4, "out of range"); -#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 5, "out of range"); -#else - THArgCheck( size > 0, 5, "out of range"); -#endif THArgCheck(firstIndex+size <= src->size(dimension), 5, "out of range"); THCTensor_(set)(state, self, src); @@ -390,12 +396,8 @@ void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a 
vector"); -#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); -#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 4, "out of range"); @@ -417,8 +419,8 @@ void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); THCTensor_(set)(state, self, src); @@ -440,11 +442,8 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); -#endif - THArgCheck(dimension < src->dim(), 2, "out of range"); - THArgCheck(size <= src->size(dimension), 3, "out of range"); + THArgCheck(dimension < THTensor_nDimensionLegacyNoScalars(src), 2, "out of range"); + THArgCheck(size <= THTensor_sizeLegacyNoScalars(src, dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THCTensor_(set)(state, self, src); @@ -453,18 +452,20 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di std::vector newStride(self->dim() + 1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride(dimension); + newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); for(d = 0; d < self->dim(); d++) { + auto self_size = THTensor_sizeLegacyNoScalars(self, d); + auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self->size(d) - size) / step + 1; - newStride[d] = step*self->stride(d); + newSize[d] = (self_size - size) / step + 1; + newStride[d] = step*self_stride; } else { - newSize[d] = self->size(d); - newStride[d] = self->stride(d); + newSize[d] = self_size; + newStride[d] = self_stride; } } @@ -603,15 +604,15 @@ void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int6 void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) { - THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { - THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h index dbb1591ae194f2..2ee1bf11a4be4c 100644 --- a/aten/src/THC/generic/THCTensor.h +++ 
b/aten/src/THC/generic/THCTensor.h @@ -26,7 +26,9 @@ THC_API int THCTensor_(nDimensionLegacyNoScalars)(THCState *state, const THCTens THC_API int THCTensor_(nDimensionLegacyAll)(THCState *state, const THCTensor *self); THC_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self); THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self); THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 4cbf5dd224abe5..82f56f9946e471 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -537,16 +537,6 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in THLongStorage *newSize; -#ifndef USE_TH_SIZE_ZERO_DIM - if (numIndices == 0) { - newSize = THCTensor_(newSizeOf)(state, src); - THLongStorage_set(newSize, 0, numIndices); - THCTensor_(resize)(state, dst, newSize, NULL); - THLongStorage_free(newSize); - return; - } -#endif - newSize = THCTensor_(newSizeOf)(state, src); THLongStorage_set(newSize, dim, numIndices); THCTensor_(resize)(state, dst, newSize, NULL); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 642b14aec48cfd..cc1a8c9ba57e41 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -330,9 +330,6 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k){ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); int nDimension = THCTensor_(nDimensionLegacyNoScalars)(state, src_); -#ifndef USE_TH_SIZE_ZERO_DIM - AT_ASSERT(!src_->is_empty()); -#endif THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); if (nDimension == 2) { int64_t stride0 = THCTensor_(stride)(state, src_, 0); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 17ef020e85f8ee..591780b04edf75 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -49,11 +49,15 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); - if( (mat->dim() != 2) || (vec->dim() != 1) ) + if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) THError("2D tensor and 1D tensor expected, got %dD, %dD tensors", - mat->dim(), vec->dim()); + mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); - if( mat->size(1) != vec->size(0) ) + + auto vec_size = THTensor_sizeLegacyNoScalars(vec, 0); + auto vec_stride = THTensor_strideLegacyNoScalars(vec, 0); + + if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) THError("size mismatch"); if(t->dim() != 1) @@ -74,12 +78,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 'n', mat->size(0), mat->size(1), alpha, 
THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 'n', mat->size(0), mat->size(1), alpha, THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -88,12 +92,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -104,12 +108,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif @@ -129,7 +133,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #elif defined(THC_REAL_IS_HALF) // Currently no Hgemv/SgemvEx in Cublas THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); - THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size(0), 1); + THCTensor_(resize2d)(state, vecAsMatrix, vec_size, 1); THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size(0), 1); @@ -151,16 +155,20 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); - if ( (vec1->dim() != 1) || (vec2->dim() != 1) ) { + if ( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) { THError("1D tensors expected, got %dD, %dD tensors", - vec1->dim(), vec2->dim()); + THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); } + auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); + auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); + auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); + auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); if (t->dim() != 2) { THError("size mismatch"); } - if ( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { + if ( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { THError("size mismatch"); } @@ -179,28 +187,28 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a if(r_->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, 
vec1->size(0), vec2->size(0), - alpha, THCTensor_(data)(state, vec1), vec1->stride(0), - THCTensor_(data)(state, vec2), vec2->stride(0), + THCudaBlas_Sger(state, vec1_size, vec2_size, + alpha, THCTensor_(data)(state, vec1), vec1_stride, + THCTensor_(data)(state, vec2), vec2_stride, THCTensor_(data)(state, r_), r_->stride(1)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec1->size(0), vec2->size(0), - alpha, THCTensor_(data)(state, vec1), vec1->stride(0), - THCTensor_(data)(state, vec2), vec2->stride(0), + THCudaBlas_Dger(state, vec1->size(0), vec2_size, + alpha, THCTensor_(data)(state, vec1), vec1_stride, + THCTensor_(data)(state, vec2), vec2_stride, THCTensor_(data)(state, r_), r_->stride(1)); #endif } else if(r_->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Sger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Dger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -209,14 +217,14 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor *cr = THCTensor_(newClone)(state, r_); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Sger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, cr), cr->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Dger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, cr), cr->stride(0)); #endif @@ -225,11 +233,11 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a #elif defined(THC_REAL_IS_HALF) // currently no Hger/SgerEx in Cublas. 
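These addr hunks change only how the vector lengths and strides are obtained (via the LegacyNoScalars helpers) before being handed to the BLAS rank-1 update; the update itself is unchanged. As a reminder, Sger/Dger compute A += alpha * x * y^T on a column-major matrix, roughly equivalent to the reference loop below; ger_reference is an illustrative name, not a function in this codebase. For half precision, where no ger routine exists, the code instead reshapes the vectors to matrices and dispatches to addmm, as the lines that follow show.

// Reference semantics of the BLAS ger rank-1 update used by addr:
// a(i, j) += alpha * x(i) * y(j), with a stored column-major and
// leading dimension lda; incx/incy are the element strides of x and y.
void ger_reference(int m, int n, float alpha,
                   const float* x, int incx,
                   const float* y, int incy,
                   float* a, int lda) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      a[i + j * lda] += alpha * x[i * incx] * y[j * incy];
    }
  }
}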
THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); - THCTensor_(resize2d)(state, vec2T, vec2T->size(0), 1); + THCTensor_(resize2d)(state, vec2T, vec2_size, 1); THCTensor_(transpose)(state, vec2T, NULL, 0, 1); THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); - THCTensor_(resize2d)(state, vec1M, vec1M->size(0), 1); + THCTensor_(resize2d)(state, vec1M, vec1_size, 1); THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); THCTensor_(free)(state, vec2T); diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 79b11c2db9b64f..78faef7a7f227b 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -43,7 +43,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu @@ -71,7 +70,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index d17d185b4858bf..9c4c2ea1fdc8b6 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -8,15 +8,17 @@ struct ELUupdateOutput_functor { const T negcoef_; const T poscoef_; + const T negiptcoef_; - ELUupdateOutput_functor(T negcoef, T poscoef) + ELUupdateOutput_functor(T negcoef, T poscoef, T negiptcoef) : negcoef_(negcoef) , poscoef_(poscoef) + , negiptcoef_(negiptcoef) {} __device__ void operator()(T *output, const T *input) const { - *output = *input <= 0 ? (exp(*input) - 1) * negcoef_ : *input * poscoef_; + *output = *input <= 0 ? (exp(*input * negiptcoef_) - 1) * negcoef_ : *input * poscoef_; } }; @@ -26,15 +28,17 @@ struct ELUupdateOutputIP_functor { const T negcoef_; const T poscoef_; + const T negiptcoef_; - ELUupdateOutputIP_functor(T negcoef, T poscoef) + ELUupdateOutputIP_functor(T negcoef, T poscoef, T negiptcoef) : negcoef_(negcoef) , poscoef_(poscoef) + , negiptcoef_(negiptcoef) {} __device__ void operator()(T *x) const { - *x = *x <= 0 ? (exp(*x) - 1) * negcoef_ : *x * poscoef_; + *x = *x <= 0 ? (exp(*x * negiptcoef_) - 1) * negcoef_ : *x * poscoef_; } }; @@ -43,15 +47,17 @@ struct ELUupdateGradInput_functor { const T negcoef_; const T poscoef_; + const T negiptcoef_; - ELUupdateGradInput_functor(T negcoef, T poscoef) + ELUupdateGradInput_functor(T negcoef, T poscoef, T negiptcoef) : negcoef_(negcoef) , poscoef_(poscoef) + , negiptcoef_(negiptcoef) {} __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - *gradInput = (*output) <= 0 ? (*gradOutput * (*output + negcoef_)) : (*gradOutput * poscoef_); + *gradInput = (*output) <= 0 ? 
(*gradOutput * negiptcoef_ * (*output + negcoef_)) : (*gradOutput * poscoef_); } }; diff --git a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu deleted file mode 100644 index 30a1a5d5ade10b..00000000000000 --- a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu +++ /dev/null @@ -1,243 +0,0 @@ -#include "THCUNN.h" -#include "common.h" -#include "THCDeviceTensor.cuh" -#include "THCDeviceTensorUtils.cuh" -#include "THCDeviceUtils.cuh" -#include "THCHalf.h" -#include "THCHalfAutoNumerics.cuh" -#include "THCAtomics.cuh" - -#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < W && y >= 0 && y < H) -#define SAFE_ADD(input, x, y, n, c, H, W, value) \ - do { \ - if (WITHIN_BOUNDS(x, y, H, W)) { \ - atomicAdd(&input[n][c][y][x], value); \ - } \ - } while(0) - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -const int MODE_BORDER = 1; - - -template -__launch_bounds__(1024) -__global__ void SpatialGridSamplerBilinear_updateOutput_kernel( - const int nthreads, - THCDeviceTensor input, - THCDeviceTensor grid, - THCDeviceTensor output, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int IH = input.getSize(2); - int IW = input.getSize(3); - int H = grid.getSize(1); - int W = grid.getSize(2); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int h = (index / N) % H; - const int w = (index / (N * H)) % W; - int c; - - // get the corresponding input x, y co-ordinates from grid - Dtype ix = grid[n][h][w][0]; - Dtype iy = grid[n][h][w][1]; - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ScalarConvert::to(ix)); - int iy_nw = floor(ScalarConvert::to(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - Dtype nw = (ix_se - ix) * (iy_se - iy); - Dtype ne = (ix - ix_sw) * (iy_sw - iy); - Dtype sw = (ix_ne - ix) * (iy - iy_ne); - Dtype se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_nw, ix_nw, IW); - CLIP_COORDINATES(iy_nw, iy_nw, IH); - CLIP_COORDINATES(ix_ne, ix_ne, IW); - CLIP_COORDINATES(iy_ne, iy_ne, IH); - CLIP_COORDINATES(ix_sw, ix_sw, IW); - CLIP_COORDINATES(iy_sw, iy_sw, IH); - CLIP_COORDINATES(ix_se, ix_se, IW); - CLIP_COORDINATES(iy_se, iy_se, IH); - } - - Dtype out_val; - for (c = 0; c < C; ++c) { - out_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_nw, iy_nw, IH, IW)) { - out_val += input[n][c][iy_nw][ix_nw] * nw; - } - if (WITHIN_BOUNDS(ix_ne, iy_ne, IH, IW)) { - out_val += input[n][c][iy_ne][ix_ne] * ne; - } - if (WITHIN_BOUNDS(ix_sw, iy_sw, IH, IW)) { - out_val += input[n][c][iy_sw][ix_sw] * sw; - } - if (WITHIN_BOUNDS(ix_se, iy_se, IH, IW)) { - out_val += input[n][c][iy_se][ix_se] * se; - } - output[n][c][h][w] = out_val; - } - } -} - -template -__launch_bounds__(1024) -__global__ void SpatialGridSamplerBilinear_updateGradInput_kernel( - const int nthreads, - THCDeviceTensor input, THCDeviceTensor gradInput, - THCDeviceTensor grid, THCDeviceTensor gradGrid, - THCDeviceTensor 
gradOutput, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int IH = input.getSize(2); - int IW = input.getSize(3); - int H = grid.getSize(1); - int W = grid.getSize(2); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int h = (index / N) % H; - const int w = (index / (N * H)) % W; - - // get the corresponding input x, y co-ordinates from grid - Dtype ix = grid[n][h][w][0]; - Dtype iy = grid[n][h][w][1]; - - Dtype gix = ScalarConvert::to(0); - Dtype giy = ScalarConvert::to(0); - - // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1));; - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ScalarConvert::to(ix)); - int iy_nw = floor(ScalarConvert::to(iy));; - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - Dtype nw = (ix_se - ix) * (iy_se - iy); - Dtype ne = (ix - ix_sw) * (iy_sw - iy); - Dtype sw = (ix_ne - ix) * (iy - iy_ne); - Dtype se = (ix - ix_nw) * (iy - iy_nw); - - Dtype gradout; - Dtype nw_val; - Dtype ne_val; - Dtype sw_val; - Dtype se_val; - - int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - if (padding_mode==MODE_BORDER){ - // get clipped NE, NW, SE, SW pixel values from (x, y) - CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); - CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); - CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); - CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); - CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); - CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); - CLIP_COORDINATES(ix_se, ix_se_cl, IW); - CLIP_COORDINATES(iy_se, iy_se_cl, IH); - } - else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - for (int c = 0; c < C; ++c) { - gradout = gradOutput[n][c][h][w]; - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); - SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); - SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); - SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); - - // calculate gradGrid - nw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_nw_cl, iy_nw_cl, IH, IW)) { - nw_val = input[n][c][iy_nw_cl][ix_nw_cl]; - } - ne_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_ne_cl, iy_ne_cl, IH, IW)) { - ne_val = input[n][c][iy_ne_cl][ix_ne_cl]; - } - sw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_sw_cl, iy_sw_cl, IH, IW)) { - sw_val = input[n][c][iy_sw_cl][ix_sw_cl]; - } - se_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_se_cl, iy_se_cl, IH, IW)) { - se_val = input[n][c][iy_se_cl][ix_se_cl]; - } - - gix += ScalarConvert::to(-1)*(nw_val * (iy_se - iy) * gradout); - gix += ne_val * (iy_sw - iy) * gradout; - gix += ScalarConvert::to(-1)*(sw_val * (iy - iy_ne) * gradout); - gix += se_val * (iy - iy_nw) * gradout; - - giy += ScalarConvert::to(-1)*(nw_val * (ix_se - ix) * gradout); - giy += ScalarConvert::to(-1)*(ne_val * (ix - ix_sw) * gradout); - giy += sw_val * (ix_ne - ix) * gradout; - giy += se_val * (ix - ix_nw) * gradout; - } - - // un-normalize gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - - Dtype gix_old = gradGrid[n][h][w][0]; - Dtype giy_old = 
gradGrid[n][h][w][1]; - - gradGrid[n][h][w][0] = gix_old + gix; - gradGrid[n][h][w][1] = giy_old + giy; - } -} - -#undef MIN -#undef MAX -#undef CLIP_COORDINATES -#undef WITHIN_BOUNDS -#undef SAFE_ADD - -#include "generic/SpatialGridSamplerBilinear.cu" -#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu deleted file mode 100644 index 43b8ceff1cb8ae..00000000000000 --- a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu +++ /dev/null @@ -1,421 +0,0 @@ -#include "THCUNN.h" -#include "common.h" -#include "THCDeviceTensor.cuh" -#include "THCDeviceTensorUtils.cuh" -#include "THCDeviceUtils.cuh" -#include "THCHalf.h" -#include "THCHalfAutoNumerics.cuh" -#include "THCAtomics.cuh" - -#define WITHIN_BOUNDS(x, y, z, D, H, W) (x >= 0 && x < W && y >= 0 && y < H && z >= 0 && z < D) -#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ - do { \ - if (WITHIN_BOUNDS(x, y, z, D, H, W)) { \ - atomicAdd(&input[n][c][z][y][x], value); \ - } \ - } while(0) - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -const int MODE_BORDER = 1; - - -template -__launch_bounds__(1024) -__global__ void VolumetricGridSamplerBilinear_updateOutput_kernel( - const int nthreads, - THCDeviceTensor input, - THCDeviceTensor grid, - THCDeviceTensor output, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int ID = input.getSize(2); - int IH = input.getSize(3); - int IW = input.getSize(4); - int D = grid.getSize(1); - int H = grid.getSize(2); - int W = grid.getSize(3); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int d = (index / N) % D; - const int h = (index / (N * D)) % H; - const int w = (index / (N * D * H)) % W; - int c; - - // get the corresponding input x, y, z co-ordinates from grid - Dtype ix = grid[n][d][h][w][0]; - Dtype iy = grid[n][d][h][w][1]; - Dtype iz = grid[n][d][h][w][2]; - - // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); - iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ScalarConvert::to(ix)); - int iy_tnw = floor(ScalarConvert::to(iy)); - int iz_tnw = floor(ScalarConvert::to(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - Dtype bsw = 
(ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw, ID); - CLIP_COORDINATES(ix_tne, ix_tne, IW); - CLIP_COORDINATES(iy_tne, iy_tne, IH); - CLIP_COORDINATES(iz_tne, iz_tne, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw, ID); - CLIP_COORDINATES(ix_tse, ix_tse, IW); - CLIP_COORDINATES(iy_tse, iy_tse, IH); - CLIP_COORDINATES(iz_tse, iz_tse, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw, ID); - CLIP_COORDINATES(ix_bne, ix_bne, IW); - CLIP_COORDINATES(iy_bne, iy_bne, IH); - CLIP_COORDINATES(iz_bne, iz_bne, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw, ID); - CLIP_COORDINATES(ix_bse, ix_bse, IW); - CLIP_COORDINATES(iy_bse, iy_bse, IH); - CLIP_COORDINATES(iz_bse, iz_bse, ID); - } - - Dtype out_val; - for (c = 0; c < C; ++c) { - out_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tnw, iy_tnw, iz_tnw, ID, IH, IW)) { - out_val += input[n][c][iz_tnw][iy_tnw][ix_tnw] * tnw; - } - if (WITHIN_BOUNDS(ix_tne, iy_tne, iz_tne, ID, IH, IW)) { - out_val += input[n][c][iz_tne][iy_tne][ix_tne] * tne; - } - if (WITHIN_BOUNDS(ix_tsw, iy_tsw, iz_tsw, ID, IH, IW)) { - out_val += input[n][c][iz_tsw][iy_tsw][ix_tsw] * tsw; - } - if (WITHIN_BOUNDS(ix_tse, iy_tse, iz_tse, ID, IH, IW)) { - out_val += input[n][c][iz_tse][iy_tse][ix_tse] * tse; - } - if (WITHIN_BOUNDS(ix_bnw, iy_bnw, iz_bnw, ID, IH, IW)) { - out_val += input[n][c][iz_bnw][iy_bnw][ix_bnw] * bnw; - } - if (WITHIN_BOUNDS(ix_bne, iy_bne, iz_bne, ID, IH, IW)) { - out_val += input[n][c][iz_bne][iy_bne][ix_bne] * bne; - } - if (WITHIN_BOUNDS(ix_bsw, iy_bsw, iz_bsw, ID, IH, IW)) { - out_val += input[n][c][iz_bsw][iy_bsw][ix_bsw] * bsw; - } - if (WITHIN_BOUNDS(ix_bse, iy_bse, iz_bse, ID, IH, IW)) { - out_val += input[n][c][iz_bse][iy_bse][ix_bse] * bse; - } - output[n][c][d][h][w] = out_val; - } - } -} - -template -__launch_bounds__(1024) -__global__ void VolumetricGridSamplerBilinear_updateGradInput_kernel( - const int nthreads, - THCDeviceTensor input, THCDeviceTensor gradInput, - THCDeviceTensor grid, THCDeviceTensor gradGrid, - THCDeviceTensor gradOutput, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int ID = input.getSize(2); - int IH = input.getSize(3); - int IW = input.getSize(4); - int D = grid.getSize(1); - int H = grid.getSize(2); - int W = grid.getSize(3); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int d = (index / N) % D; - const int h = (index / (N * D)) % H; - const int w = (index / (N * D * H)) % W; - - // get the corresponding input x, y, z co-ordinates from grid - Dtype ix = grid[n][d][h][w][0]; - Dtype iy = grid[n][d][h][w][1]; - Dtype iz = grid[n][d][h][w][2]; - - Dtype gix = ScalarConvert::to(0); - Dtype giy = ScalarConvert::to(0); - Dtype giz = ScalarConvert::to(0); - - // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); - iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); - - // get corner pixel values from (x, y, z) - 
// for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ScalarConvert::to(ix)); - int iy_tnw = floor(ScalarConvert::to(iy)); - int iz_tnw = floor(ScalarConvert::to(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - Dtype gradout; - Dtype tnw_val; - Dtype tne_val; - Dtype tsw_val; - Dtype tse_val; - Dtype bnw_val; - Dtype bne_val; - Dtype bsw_val; - Dtype bse_val; - - int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); - CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); - CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); - CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); - CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); - CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); - CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); - CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); - CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); - CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); - CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); - CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); - CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); - } - else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - for (int c = 0; c < C; ++c) { - gradout = gradOutput[n][c][d][h][w]; - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, 
IW, tnw * gradout); - SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); - SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); - SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); - SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); - SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); - SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); - SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); - - // calculate gradGrid - tnw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ID, IH, IW)) { - tnw_val = input[n][c][iz_tnw_cl][iy_tnw_cl][ix_tnw_cl]; - } - tne_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tne_cl, iy_tne_cl, iz_tne_cl, ID, IH, IW)) { - tne_val = input[n][c][iz_tne_cl][iy_tne_cl][ix_tne_cl]; - } - tsw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ID, IH, IW)) { - tsw_val = input[n][c][iz_tsw_cl][iy_tsw_cl][ix_tsw_cl]; - } - tse_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tse_cl, iy_tse_cl, iz_tse_cl, ID, IH, IW)) { - tse_val = input[n][c][iz_tse_cl][iy_tse_cl][ix_tse_cl]; - } - bnw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ID, IH, IW)) { - bnw_val = input[n][c][iz_bnw_cl][iy_bnw_cl][ix_bnw_cl]; - } - bne_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bne_cl, iy_bne_cl, iz_bne_cl, ID, IH, IW)) { - bne_val = input[n][c][iz_bne_cl][iy_bne_cl][ix_bne_cl]; - } - bsw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ID, IH, IW)) { - bsw_val = input[n][c][iz_bsw_cl][iy_bsw_cl][ix_bsw_cl]; - } - bse_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bse_cl, iy_bse_cl, iz_bse_cl, ID, IH, IW)) { - bse_val = input[n][c][iz_bse_cl][iy_bse_cl][ix_bse_cl]; - } - - Dtype m1 = ScalarConvert::to(-1); - gix += m1 * tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; - gix += m1 * tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; - gix += m1 * bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; - gix += m1 * bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; - - - giy += m1 * tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; - giy += m1 * tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; - giy += m1 * bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; - giy += m1 * bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; - - giz += m1 * tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; - giz += m1 * tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; - giz += m1 * tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; - giz += m1 * tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; - } - - // un-normalize 
gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - giz = giz * (ID - 1) / 2; - - Dtype gix_old = gradGrid[n][d][h][w][0]; - Dtype giy_old = gradGrid[n][d][h][w][1]; - Dtype giz_old = gradGrid[n][d][h][w][2]; - - gradGrid[n][d][h][w][0] = gix_old + gix; - gradGrid[n][d][h][w][1] = giy_old + giy; - gradGrid[n][d][h][w][2] = giz_old + giz; - } -} - -#undef MIN -#undef MAX -#undef CLIP_COORDINATES -#undef WITHIN_BOUNDS -#undef SAFE_ADD - -#include "generic/VolumetricGridSamplerBilinear.cu" -#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h index 47f9bee0fb6744..e2a99640ba69b6 100644 --- a/aten/src/THCUNN/common.h +++ b/aten/src/THCUNN/common.h @@ -62,7 +62,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -70,7 +70,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCIndexTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCIndexTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index 03dd38a7bd76ee..81eabc68812f36 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -21,11 +21,11 @@ static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { - size[i] = t->size(i); + size[i] = THTensor_sizeLegacyNoScalars(t, i); } else if (i < Dim) { size[i] = 1; } else { - size[Dim - 1] *= t->size(i); + size[Dim - 1] *= THTensor_sizeLegacyNoScalars(t, i); } } return THCDeviceTensor(t->data(), size); diff --git a/aten/src/THCUNN/generic/ClassNLLCriterion.cu b/aten/src/THCUNN/generic/ClassNLLCriterion.cu index 6126dee76dcb27..6866c5798f7d23 100644 --- a/aten/src/THCUNN/generic/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/ClassNLLCriterion.cu @@ -16,7 +16,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } int n_dims = THCTensor_(nDimensionLegacyNoScalars)(state, input); - int n_classes = THCTensor_(size)(state, input, n_dims - 1); + int n_classes = THCTensor_(sizeLegacyNoScalars)(state, input, n_dims - 1); ignore_index -= TH_INDEX_BASE; if (weights) { @@ -31,8 +31,8 @@ void THNN_(ClassNLLCriterion_updateOutput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); - int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(size)(state, input, 0); - int64_t num_targets = THCudaLongTensor_size(state, target, 0); + int64_t batch_size = n_dims == 1 ? 
1 : THCTensor_(sizeLegacyNoScalars)(state, input, 0); + int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); @@ -152,7 +152,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(size)(state, input, 0); - int64_t num_targets = THCudaLongTensor_size(state, target, 0); + int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); diff --git a/aten/src/THCUNN/generic/ELU.cu b/aten/src/THCUNN/generic/ELU.cu index 5c09a0607f0246..6f78349110ec35 100644 --- a/aten/src/THCUNN/generic/ELU.cu +++ b/aten/src/THCUNN/generic/ELU.cu @@ -11,21 +11,23 @@ void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, + accreal input_scale, bool inplace) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale); + real poscoef = ScalarConvert::to(scale * input_scale); + real negiptcoef = ScalarConvert::to(input_scale); THCUNN_assertSameGPU(state, 2, input, output); if (inplace) { - THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef)); + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef, negiptcoef)); THCTensor_(set)(state, output, input); } else { THCTensor_(resizeAs)(state, output, input); - THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef)); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef, negiptcoef)); } } @@ -36,15 +38,17 @@ void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale) + accreal scale, + accreal input_scale) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale); + real poscoef = ScalarConvert::to(scale * input_scale); + real negiptcoef = ScalarConvert::to(input_scale); THCUNN_check_nElement(state, output, gradOutput); THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); THCTensor_(resizeAs)(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef)); + THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef, negiptcoef)); } #endif diff --git a/aten/src/THCUNN/generic/GatedLinearUnit.cu b/aten/src/THCUNN/generic/GatedLinearUnit.cu index 4622403e76088f..9bd59eec538cb6 100644 --- a/aten/src/THCUNN/generic/GatedLinearUnit.cu +++ b/aten/src/THCUNN/generic/GatedLinearUnit.cu @@ -12,7 +12,7 @@ void THNN_(GatedLinear_updateOutput)( // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THCTensor_(size)(state, input, dim); + const int64_t nIn = THCTensor_(sizeLegacyNoScalars)(state, input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 8272b3d4020ec7..65bd6cdec850bb 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -18,7 +18,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -52,7 +52,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -149,7 +149,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -187,7 +187,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu index 2517b409409aed..2a0d719ff6a3e6 100644 --- a/aten/src/THCUNN/generic/PReLU.cu +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -24,8 +24,8 @@ void THNN_(PReLU_updateOutput)( input = THCTensor_(newContiguous)(state, input); int n = THCTensor_(nElement)(state, input); - if (input->size(ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { @@ -69,8 +69,8 @@ void THNN_(PReLU_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); int n = THCTensor_(nElement)(state, input); - if (input->size(ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index f73bd5835c04bb..0363dcf0e3996a 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(1) == 3; + return !t->is_empty() && t->dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(0) == size0 && t->size(1) == size1; + return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu index b7010977558816..ae211774a580db 100644 --- a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -8,10 +8,10 @@ void THNN_(SpatialClassNLLCriterion_shapeCheck)( THCIndexTensor *target, THCTensor *weights) { - AT_CHECK(!target->is_empty() && THCIndexTensor_(nDimensionLegacyNoScalars)(state, target) == 3, 1, + AT_CHECK(!target->is_empty() && target->dim() == 3, 1, "only batches of spatial targets supported (non-empty 3D tensors)" \ " but got targets of size: : ", target->sizes()); - AT_CHECK(!input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, + AT_CHECK(!input->is_empty() && input->dim() == 4, 2, "only batches of spatial inputs supported (non-empty 4D tensors), " \ "but got input of size: ", input->sizes()); if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 334afe93cb727e..7860404b685f52 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -73,7 +73,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 7c6716c41f5bff..546ec2ae3c6185 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -31,7 +31,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // Bias has same # of channels as output if (bias) { - THAssert(bias->size(0) == weight->size(0)); + THAssert(THTensor_sizeLegacyNoScalars(bias, 0) == weight->size(0)); } input = THCTensor_(newContiguous)(state, input); diff 
--git a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu index ad0f47418b86cf..4225583735460e 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu index 76777796e361e4..8d039d54068aaf 100644 --- a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -351,7 +351,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight != NULL) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias != NULL) { - nOutputPlane = THCTensor_(size)(state, gradBias, 0); + nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu deleted file mode 100644 index 7e285cb55fa7d2..00000000000000 --- a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "generic/SpatialGridSamplerBilinear.cu" -#else - -static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *gradOutput) { - THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 4, 2, grid, - "4D grid tensor expected but got: %s"); - - int64_t nbatch = THCTensor_(size)(state, input, 0); - int64_t channels = THCTensor_(size)(state, input, 1); - int64_t iheight = THCTensor_(size)(state, input, 2); - int64_t iwidth = THCTensor_(size)(state, input, 3); - int64_t oheight = THCTensor_(size)(state, grid, 1); - int64_t owidth = THCTensor_(size)(state, grid, 2); - - THCUNN_check_dim_size(state, grid, 4, 0, nbatch); - THCUNN_check_dim_size(state, grid, 4, 3, 2); - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nbatch); - THCUNN_check_dim_size(state, gradOutput, 4, 1, channels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, oheight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, owidth); - } -} - -THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *output, - int padding_mode) { - 
- THCUNN_assertSameGPU(state, 3, input, grid, output); - THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t IH = THCTensor_(size)(state, input, 2); - int64_t IW = THCTensor_(size)(state, input, 3); - int64_t H = THCTensor_(size)(state,grid, 1); - int64_t W = THCTensor_(size)(state, grid, 2); - - // resize output to the same shape as input - THCTensor_(resize4d)(state, output, N, C, H, W); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devOutput = toDeviceTensor(state, output); - - int count = static_cast(N*H*W); - SpatialGridSamplerBilinear_updateOutput_kernel - <<>>( - count, devInput, devGrid, devOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode) { - - THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); - THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t IH = THCTensor_(size)(state, input, 2); - int64_t IW = THCTensor_(size)(state, input, 3); - int64_t H = THCTensor_(size)(state, grid, 1); - int64_t W = THCTensor_(size)(state, grid, 2); - - THCTensor_(resize4d)(state, gradInput, N, C, IH, IW); - THCTensor_(resize4d)(state, gradGrid, N, H, W, 2); - THCTensor_(zero)(state, gradInput); - THCTensor_(zero)(state, gradGrid); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); - THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); - - int count = static_cast(N*H*W); - SpatialGridSamplerBilinear_updateGradInput_kernel - <<>>( - count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index eaadf66c8306ee..3c4883a1e3c45d 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -119,6 +119,7 @@ THC_API void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, + accreal input_scale, bool inplace); THC_API void THNN_(ELU_updateGradInput)( @@ -127,7 +128,8 @@ THC_API void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale); + accreal scale, + accreal input_scale); THC_API void THNN_(FeatureLPPooling_updateOutput)( THCState* state, @@ -1045,34 +1047,6 @@ THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( int outputHeight, int outputWidth); -THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *output, - int padding_mode); - -THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode); - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor 
*output, - int padding_mode); - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode); - THC_API void THNN_(RReLU_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 870d38ba225f8c..310f22d03e5dfa 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -79,7 +79,7 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( int planeDim = 0; int dimw = 1; - int numInputDims = THCTensor_(nDimensionLegacyNoScalars)(state, input); + int numInputDims = input->dim(); if (numInputDims == 3) { planeDim++; dimw++; diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu index 52d97fbf2a3638..d6ffba3519553c 100644 --- a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -75,7 +75,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu index 96310609e956f4..10a5fdc2643193 100644 --- a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -387,7 +387,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias) { - nOutputPlane = THCTensor_(size)(state, gradBias, 0); + nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu deleted file mode 100644 index 086667ca476ac1..00000000000000 --- a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.cu" -#else - -static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *gradOutput) { - THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 5, 2, input, - "non-empty 5D input tensor expected but got: %s"); - THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 5, 2, grid, - "non-empty 5D grid tensor expected but got: %s"); - - int64_t nbatch = THCTensor_(size)(state, input, 0); - int64_t channels = THCTensor_(size)(state, input, 1); - int64_t idepth = THCTensor_(size)(state, input, 2); - int64_t iheight = THCTensor_(size)(state, input, 3); - int64_t iwidth = THCTensor_(size)(state, input, 4); - int64_t odepth = THCTensor_(size)(state, grid, 1); - int64_t oheight = THCTensor_(size)(state, grid, 2); - int64_t owidth = 
THCTensor_(size)(state, grid, 3); - - THCUNN_check_dim_size(state, grid, 5, 0, nbatch); - THCUNN_check_dim_size(state, grid, 5, 4, 3); - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 5, 0, nbatch); - THCUNN_check_dim_size(state, gradOutput, 5, 1, channels); - THCUNN_check_dim_size(state, gradOutput, 5, 2, odepth); - THCUNN_check_dim_size(state, gradOutput, 5, 3, oheight); - THCUNN_check_dim_size(state, gradOutput, 5, 4, owidth); - } -} - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *output, - int padding_mode) { - - THCUNN_assertSameGPU(state, 3, input, grid, output); - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t ID = THCTensor_(size)(state, input, 2); - int64_t IH = THCTensor_(size)(state, input, 3); - int64_t IW = THCTensor_(size)(state, input, 4); - int64_t D = THCTensor_(size)(state,grid, 1); - int64_t H = THCTensor_(size)(state,grid, 2); - int64_t W = THCTensor_(size)(state, grid, 3); - - // resize output to the same shape as input - THCTensor_(resize5d)(state, output, N, C, D, H, W); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devOutput = toDeviceTensor(state, output); - - int count = static_cast(N*D*H*W); - VolumetricGridSamplerBilinear_updateOutput_kernel - <<>>( - count, devInput, devGrid, devOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode) { - - THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t ID = THCTensor_(size)(state, input, 2); - int64_t IH = THCTensor_(size)(state, input, 3); - int64_t IW = THCTensor_(size)(state, input, 4); - int64_t D = THCTensor_(size)(state,grid, 1); - int64_t H = THCTensor_(size)(state,grid, 2); - int64_t W = THCTensor_(size)(state, grid, 3); - - THCTensor_(resize5d)(state, gradInput, N, C, ID, IH, IW); - THCTensor_(resize5d)(state, gradGrid, N, D, H, W, 3); - THCTensor_(zero)(state, gradInput); - THCTensor_(zero)(state, gradGrid); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); - THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); - - int count = static_cast(N*D*H*W); - VolumetricGridSamplerBilinear_updateGradInput_kernel - <<>>( - count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -#endif diff --git a/aten/src/THNN/generic/ClassNLLCriterion.c b/aten/src/THNN/generic/ClassNLLCriterion.c index c7d42b583374cc..7db0531d60d1ef 100644 --- a/aten/src/THNN/generic/ClassNLLCriterion.c +++ b/aten/src/THNN/generic/ClassNLLCriterion.c @@ -82,7 +82,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - 
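/* Note on the recurring size()/dim() -> *LegacyNoScalars substitutions in this
   patch: the sketch below is a hypothetical stand-in (not the real TH API) that
   illustrates the convention these accessors are assumed to follow, namely that
   a 0-dimensional scalar tensor is reported as a 1-element, 1-d tensor so that
   vector-oriented call sites keep working. */
#include <stdint.h>

typedef struct { int ndim; int64_t sizes[8]; } FakeTensor;  /* illustrative type only */

/* assumed behaviour: never report fewer than one dimension */
static int fake_nDimensionLegacyNoScalars(const FakeTensor *t) {
  return t->ndim == 0 ? 1 : t->ndim;
}

/* assumed behaviour: dimension 0 of a 0-dim tensor acts as size 1 */
static int64_t fake_sizeLegacyNoScalars(const FakeTensor *t, int dim) {
  return t->ndim == 0 ? 1 : t->sizes[dim];
}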
THAssert(THIndexTensor_(size)(target, 0) == batch_size); + THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); @@ -189,7 +189,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - THAssert(THIndexTensor_(size)(target, 0) == batch_size); + THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); diff --git a/aten/src/THNN/generic/ELU.c b/aten/src/THNN/generic/ELU.c index f2d87185b813a5..62111ebbf4d7c2 100644 --- a/aten/src/THNN/generic/ELU.c +++ b/aten/src/THNN/generic/ELU.c @@ -8,19 +8,21 @@ void THNN_(ELU_updateOutput)( THTensor *output, accreal alpha_, accreal scale, + accreal input_scale, bool inplace) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); + real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); if (inplace) { TH_TENSOR_APPLY(real, input, - *input_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + *input_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; ); THTensor_(set)(output, input); } else { THTensor_(resizeAs)(output, input); TH_TENSOR_APPLY2(real, input, real, output, - *output_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + *output_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; ); } } @@ -31,14 +33,16 @@ void THNN_(ELU_updateGradInput)( THTensor *gradInput, THTensor *output, accreal alpha_, - accreal scale) + accreal scale, + accreal input_scale) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); + real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); THNN_CHECK_NELEMENT(output, gradOutput); THTensor_(resizeAs)(gradInput, output); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, - *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + negcoef) : *gradOutput_data * poscoef; + *gradInput_data = *output_data <= 0 ? *gradOutput_data * negiptcoef * (*output_data + negcoef) : *gradOutput_data * poscoef; ); } diff --git a/aten/src/THNN/generic/GatedLinearUnit.c b/aten/src/THNN/generic/GatedLinearUnit.c index 68cdc37d54214a..0f888744240473 100644 --- a/aten/src/THNN/generic/GatedLinearUnit.c +++ b/aten/src/THNN/generic/GatedLinearUnit.c @@ -10,7 +10,7 @@ void THNN_(GatedLinear_updateOutput)( { // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THTensor_(size)(input, dim); + const int64_t nIn = THTensor_sizeLegacyNoScalars(input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
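/* The ELU hunks above thread a new `input_scale` argument through both the CPU
   and CUDA kernels. The standalone sketch below restates the scalar formula as
   it appears in the patch; the function names are illustrative only and are not
   part of THNN. */
#include <math.h>

static double elu_forward(double x, double alpha, double scale, double input_scale) {
  /* negative branch: scaled exponential; positive branch: plain scaling */
  return x <= 0 ? (exp(x * input_scale) - 1) * alpha * scale
                : x * scale * input_scale;
}

static double elu_backward(double grad_out, double out,
                           double alpha, double scale, double input_scale) {
  /* as in the patch, the gradient is expressed through the saved output:
     d/dx[(exp(x*input_scale)-1)*alpha*scale] = input_scale * (out + alpha*scale) */
  return out <= 0 ? grad_out * input_scale * (out + alpha * scale)
                  : grad_out * scale * input_scale;
}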
Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c index 2260b168d8e8d5..fa6648e2a6b80c 100644 --- a/aten/src/THNN/generic/LookupTable.c +++ b/aten/src/THNN/generic/LookupTable.c @@ -40,7 +40,7 @@ void THNN_(LookupTable_accGradParameters)( if (scaleGradByFreq) { - THIntegerTensor_(resize1d)(count, gradWeight->size(0)); + THIntegerTensor_(resize1d)(count, THTensor_sizeLegacyNoScalars(gradWeight, 0)); count_data = THIntegerTensor_(data)(count); } diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 0699c3ac471c55..a18252b06914d6 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,14 +17,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); } else @@ -155,16 +155,16 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), "inconsistent isTarget size"); } else diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 424669e5de8515..2c8f38be23eb3a 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,19 +20,19 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), + AT_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -136,19 +136,19 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + 
AT_CHECK(!input->is_empty() && (input->dim() <= 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c index e148fde783ce9d..1837874852d2bb 100644 --- a/aten/src/THNN/generic/PReLU.c +++ b/aten/src/THNN/generic/PReLU.c @@ -26,8 +26,8 @@ void THNN_(PReLU_updateOutput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (input->size(input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -91,8 +91,8 @@ void THNN_(PReLU_updateGradInput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (input->size(input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -162,8 +162,8 @@ void THNN_(PReLU_accGradParameters)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (input->size(input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index a28d4e78477ceb..3bf8e652fa9ed9 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -26,7 +26,7 @@ static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index fce2c8575935a5..f18a6d0817059b 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -72,7 +72,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for(i = 0; i < gradBias->size(0); i++) + for(i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c index 63e7bd81033e12..2f71861963fcdf 100644 --- a/aten/src/THNN/generic/SpatialDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c index 7226db67ef1a74..eeb644fc9eb5e6 100644 --- a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_(size)(gradBias, 0); + nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); } else { return; } @@ -402,7 +402,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see 
http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t n = columns->size(0); // nOutputPlane * kh * kw - int64_t m = input_n->size(0); // nInputPlane + int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0); // nInputPlane int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) diff --git a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c deleted file mode 100644 index d31f3e0a76c20a..00000000000000 --- a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c +++ /dev/null @@ -1,250 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c" -#else - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) - -#undef MODE_BORDER -#define MODE_BORDER 1 - -static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck) - (THTensor *input, THTensor *grid, THTensor *gradOutput) { - THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 4, 2, grid, - "non-empty 4D grid tensor expected but got: %s"); - - int nbatch = THTensor_(size)(input, 0); - int channels = THTensor_(size)(input, 1); - int oheight = THTensor_(size)(grid, 1); - int owidth = THTensor_(size)(grid, 2); - - THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch); - THNN_CHECK_DIM_SIZE(grid, 4, 3, 2); - - if (gradOutput != NULL) { - THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch); - THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels); - THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight); - THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth); - } -} - -#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \ - && y < H ? 
THTensor_(fastGet4d)(input, n, c, y, x) : 0 - -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode) { - - THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int IH = THTensor_(size)(input, 2); - int IW = THTensor_(size)(input, 3); - int H = THTensor_(size)(grid, 1); - int W = THTensor_(size)(grid, 2); - - // resize output to the same shape as input - THTensor_(resize4d)(output, N, C, H, W); - - // loop over each output pixel - int n, h, w, c; -#pragma omp parallel for private(n, h, w, c) - for (n = 0; n < N; ++n) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y co-ordinates from grid - real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); - real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ix); - int iy_nw = floor(iy); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - real nw = (ix_se - ix) * (iy_se - iy); - real ne = (ix - ix_sw) * (iy_sw - iy); - real sw = (ix_ne - ix) * (iy - iy_ne); - real se = (ix - ix_nw) * (iy - iy_nw); - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_nw, ix_nw, IW); - CLIP_COORDINATES(iy_nw, iy_nw, IH); - CLIP_COORDINATES(ix_ne, ix_ne, IW); - CLIP_COORDINATES(iy_ne, iy_ne, IH); - CLIP_COORDINATES(ix_sw, ix_sw, IW); - CLIP_COORDINATES(iy_sw, iy_sw, IH); - CLIP_COORDINATES(ix_se, ix_se, IW); - CLIP_COORDINATES(iy_se, iy_se, IH); - } - - // calculate bilinear weighted pixel value and set output pixel - for (c = 0; c < C; ++c) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW); - real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW); - real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW); - real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW); - real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; - THTensor_(fastSet4d)(output, n, c, h, w, out_val); - } - } - } - } -} - -#define SAFE_ADD(input, x, y, n, c, H, W, value) \ - do { \ - if (x >= 0 && x < W && y >=0 && y < H) { \ - real old_value = THTensor_(fastGet4d)(input, n, c, y, x); \ - THTensor_(fastSet4d)(input, n, c, y, x, value + old_value); \ - } \ - } while(0) - -TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode) { - - THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int IH = THTensor_(size)(input, 2); - int IW = THTensor_(size)(input, 3); - int H = THTensor_(size)(grid, 1); - int W = THTensor_(size)(grid, 2); - - THTensor_(resize4d)(gradInput, N, C, IH, IW); - THTensor_(resize4d)(gradGrid, N, H, W, 2); - THTensor_(zero)(gradInput); - THTensor_(zero)(gradGrid); - - // loop over each output pixel - int n, h, w; -#pragma omp parallel 
for private(n, h, w) - for (n = 0; n < N; ++n) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y co-ordinates from grid - real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); - real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); - - real gix = 0; - real giy = 0; - - // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ix); - int iy_nw = floor(iy); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - real nw = (ix_se - ix) * (iy_se - iy); - real ne = (ix - ix_sw) * (iy_sw - iy); - real sw = (ix_ne - ix) * (iy - iy_ne); - real se = (ix - ix_nw) * (iy - iy_nw); - - int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - if (padding_mode==MODE_BORDER){ - // get clipped NE, NW, SE, SW pixel values from (x, y) - CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); - CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); - CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); - CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); - CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); - CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); - CLIP_COORDINATES(ix_se, ix_se_cl, IW); - CLIP_COORDINATES(iy_se, iy_se_cl, IH); - } - else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - for (int c = 0; c < C; ++c) { - real gradout = THTensor_(fastGet4d)(gradOutput, n, c, h, w); - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); - SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); - SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); - SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); - - // calculate gradGrid - real nw_val = SAFE_GET(input, ix_nw_cl, iy_nw_cl, n, c, IH, IW); - real ne_val = SAFE_GET(input, ix_ne_cl, iy_ne_cl, n, c, IH, IW); - real sw_val = SAFE_GET(input, ix_sw_cl, iy_sw_cl, n, c, IH, IW); - real se_val = SAFE_GET(input, ix_se_cl, iy_se_cl, n, c, IH, IW); - - gix -= nw_val * (iy_se - iy) * gradout; - gix += ne_val * (iy_sw - iy) * gradout; - gix -= sw_val * (iy - iy_ne) * gradout; - gix += se_val * (iy - iy_nw) * gradout; - - giy -= nw_val * (ix_se - ix) * gradout; - giy -= ne_val * (ix - ix_sw) * gradout; - giy += sw_val * (ix_ne - ix) * gradout; - giy += se_val * (ix - ix_nw) * gradout; - } - - // un-normalize gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - - real gix_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 0); - real giy_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 1); - - THTensor_(fastSet4d)(gradGrid, n, h, w, 0, gix_old + gix); - THTensor_(fastSet4d)(gradGrid, n, h, w, 1, giy_old + giy); - } - } - } -} - - -#undef MIN -#undef MAX -#undef SAFE_GET -#undef CLIP_COORDINATES -#undef SAFE_ADD -#undef MODE_BORDER - -#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 455da04c7e4454..1d7a9176553756 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -90,7 +90,8 @@ TH_API void THNN_(ELU_updateOutput)( THTensor *input, // input tensor THTensor *output, // [OUT] ELU output accreal alpha, // an ELU parameter (as in paper) - accreal scale, // scaling factor + accreal scale, // scaling factor for 
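/* Minimal sketch of the bilinear weighting used by the (now removed)
   SpatialGridSamplerBilinear CPU path above; names here are illustrative, not
   part of THNN. A grid coordinate gx in [-1, 1] maps to
   ix = ((gx + 1) / 2) * (IW - 1), so a gradient with respect to ix is rescaled
   by d(ix)/d(gx) = (IW - 1) / 2, which is the `gix * (IW - 1) / 2` step in the
   deleted backward pass. */
#include <math.h>

static double bilinear_weighted_sample(const double *img, int IW,
                                       double ix, double iy) {
  int ix_nw = (int)floor(ix), iy_nw = (int)floor(iy);
  int ix_se = ix_nw + 1,      iy_se = iy_nw + 1;
  /* each corner is weighted by the area of the rectangle opposite to it */
  double nw = (ix_se - ix) * (iy_se - iy);
  double ne = (ix - ix_nw) * (iy_se - iy);
  double sw = (ix_se - ix) * (iy - iy_nw);
  double se = (ix - ix_nw) * (iy - iy_nw);
  /* no border handling here; the real code clips (MODE_BORDER) or zero-pads
     out-of-range corners via SAFE_GET */
  return img[iy_nw * IW + ix_nw] * nw + img[iy_nw * IW + ix_se] * ne +
         img[iy_se * IW + ix_nw] * sw + img[iy_se * IW + ix_se] * se;
}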
output + accreal input_scale, // scaling factor for input bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(ELU_updateGradInput)( THNNState *state, // library's state @@ -98,7 +99,8 @@ TH_API void THNN_(ELU_updateGradInput)( THTensor *gradInput, // [OUT] gradient w.r.t. input THTensor *output, // output from a forward pass accreal alpha, // an ELU parameter (as in paper) - accreal scale); + accreal scale, + accreal input_scale); TH_API void THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, // library's state @@ -1227,34 +1229,6 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( int osizeW, bool align_corners); -TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode); - -TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode); - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode); - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode); - TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index b623e5a2ad7fd4..e7b51ec194c402 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -38,7 +38,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = weight->size(0); + int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -197,7 +197,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( THNN_(TemporalRowConvolution_shapeCheck)( state, input, NULL, weight, bias, kW, dW, padW); - int64_t inputFrameSize = weight->size(0); + int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -311,7 +311,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, NULL, kW, dW, padW); - int64_t inputFrameSize = weight->size(0); + int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -386,7 +386,7 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( THTensor_(free)(tfinput); if (gradBias != NULL) { - for (i = 0; i < gradBias->size(0); i++) { + for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { int64_t k; real sum = 0; real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d)) diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index 4b74445e047705..c979edf71f8f4c 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ 
b/aten/src/THNN/generic/VolumetricConvolution.c @@ -51,7 +51,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size(0); i++) + for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) { THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -78,7 +78,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size(0); i++) + for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) { THTensor_(select)(outn, outb, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -117,7 +117,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); - int nOutputPlane = (int)weight->size(0); + int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(weight, 0); THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, gradOutput, @@ -187,9 +187,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for gradWeight, but got: %s"); - int nOutputPlane = (int)gradWeight->size(0); + int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(gradWeight, 0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && THTensor_nDimensionLegacyNoScalars(gradBias) == 1 && THTensor_sizeLegacyNoScalars(gradBias, 0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 14d98a79dd29b8..209d1575dacbec 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -102,7 +102,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); @@ -691,7 +691,7 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for (i = 0; i < gradBias->size(0); i++) + for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c index 8222c534612fd5..c9fa19f0adf488 100644 --- a/aten/src/THNN/generic/VolumetricDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -69,7 +69,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c index 4cc4dcc69837d8..16dedeffb9c58f 100644 --- a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c +++ 
b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -154,7 +154,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( const int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - const int64_t nOutputPlane = bias->size(0); + const int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -441,7 +441,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_(size)(gradBias, 0); + nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); } else { return; } diff --git a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c deleted file mode 100644 index 4d7ace422d4e97..00000000000000 --- a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c +++ /dev/null @@ -1,409 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.c" -#else - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) - -#undef MODE_BORDER -#define MODE_BORDER 1 - -static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck) - (THTensor *input, THTensor *grid, THTensor *gradOutput) { - THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, - "non-empty 5D input tensor expected but got: %s"); - THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 5, 2, grid, - "non-empty 5D grid tensor expected but got: %s"); - - int nbatch = THTensor_(size)(input, 0); - int channels = THTensor_(size)(input, 1); - int odepth = THTensor_(size)(grid, 1); - int oheight = THTensor_(size)(grid, 2); - int owidth = THTensor_(size)(grid, 3); - - THNN_CHECK_DIM_SIZE(grid, 5, 0, nbatch); - THNN_CHECK_DIM_SIZE(grid, 5, 4, 3); - - if (gradOutput != NULL) { - THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nbatch); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, channels); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, odepth); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, oheight); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, owidth); - } -} - -#define SAFE_GET(input, x, y, z, n, c, D, H, W) \ - x >= 0 && x < W && y >=0 && y < H && z >= 0 && z < D \ - ? 
THTensor_(fastGet5d)(input, n, c, z, y, x) : 0 - -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode) { - - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, NULL); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int ID = THTensor_(size)(input, 2); - int IH = THTensor_(size)(input, 3); - int IW = THTensor_(size)(input, 4); - int D = THTensor_(size)(grid, 1); - int H = THTensor_(size)(grid, 2); - int W = THTensor_(size)(grid, 3); - - // resize output to the same shape as input - THTensor_(resize5d)(output, N, C, D, H, W); - - // loop over each output pixel - int n, d, h, w, c; -#pragma omp parallel for private(n, d, h, w, c) - for (n = 0; n < N; ++n) { - for (d = 0; d < D; ++d) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); - real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); - real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); - - // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - iz = ((iz + 1) / 2) * (ID-1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ix); - int iy_tnw = floor(iy); - int iz_tnw = floor(iz); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw, ID); - CLIP_COORDINATES(ix_tne, ix_tne, IW); - CLIP_COORDINATES(iy_tne, iy_tne, IH); - CLIP_COORDINATES(iz_tne, iz_tne, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw, ID); - CLIP_COORDINATES(ix_tse, ix_tse, IW); - CLIP_COORDINATES(iy_tse, iy_tse, IH); - CLIP_COORDINATES(iz_tse, iz_tse, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw, ID); - CLIP_COORDINATES(ix_bne, ix_bne, IW); - CLIP_COORDINATES(iy_bne, iy_bne, IH); - CLIP_COORDINATES(iz_bne, iz_bne, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw, 
ID); - CLIP_COORDINATES(ix_bse, ix_bse, IW); - CLIP_COORDINATES(iy_bse, iy_bse, IH); - CLIP_COORDINATES(iz_bse, iz_bse, ID); - } - - // calculate bilinear weighted pixel value and set output pixel - for (c = 0; c < C; ++c) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - real tnw_val = SAFE_GET(input, ix_tnw, iy_tnw, iz_tnw, n, c, ID, IH, IW); - real tne_val = SAFE_GET(input, ix_tne, iy_tne, iz_tne, n, c, ID, IH, IW); - real tsw_val = SAFE_GET(input, ix_tsw, iy_tsw, iz_tsw, n, c, ID, IH, IW); - real tse_val = SAFE_GET(input, ix_tse, iy_tse, iz_tse, n, c, ID, IH, IW); - real bnw_val = SAFE_GET(input, ix_bnw, iy_bnw, iz_bnw, n, c, ID, IH, IW); - real bne_val = SAFE_GET(input, ix_bne, iy_bne, iz_bne, n, c, ID, IH, IW); - real bsw_val = SAFE_GET(input, ix_bsw, iy_bsw, iz_bsw, n, c, ID, IH, IW); - real bse_val = SAFE_GET(input, ix_bse, iy_bse, iz_bse, n, c, ID, IH, IW); - real out_val = tnw_val * tnw + tne_val * tne + tsw_val * tsw + tse_val * tse + - bnw_val * bnw + bne_val * bne + bsw_val * bsw + bse_val * bse; - THTensor_(fastSet5d)(output, n, c, d, h, w, out_val); - } - } - } - } - } -} - -#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ - do { \ - if (x >= 0 && x < W && y >=0 && y < H && z >=0 && z < D) { \ - real old_value = THTensor_(fastGet5d)(input, n, c, z, y, x); \ - THTensor_(fastSet5d)(input, n, c, z, y, x, value + old_value); \ - } \ - } while(0) - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode) { - - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int ID = THTensor_(size)(input, 2); - int IH = THTensor_(size)(input, 3); - int IW = THTensor_(size)(input, 4); - int D = THTensor_(size)(grid, 1); - int H = THTensor_(size)(grid, 2); - int W = THTensor_(size)(grid, 3); - - THTensor_(resize5d)(gradInput, N, C, ID, IH, IW); - THTensor_(resize5d)(gradGrid, N, D, H, W, 3); - THTensor_(zero)(gradInput); - THTensor_(zero)(gradGrid); - - // loop over each output pixel - int n, d, h, w; -//#pragma omp parallel for private(n, d, h, w) - for (n = 0; n < N; ++n) { - for (d = 0; d < D; ++d) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); - real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); - real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); - - real gix = 0; - real giy = 0; - real giz = 0; - - // normalize ix, iy, iz from [-1, 1] to [0, W-1] & [0, H-1] & [0, D-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - iz = ((iz + 1) / 2) * (ID-1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ix); - int iy_tnw = floor(iy); - int iz_tnw = floor(iz); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int 
iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); - CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); - CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); - CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); - CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); - CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); - CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); - CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); - CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); - CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); - CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); - CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); - CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); - } - else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - for (int c = 0; c < C; ++c) { - real gradout = THTensor_(fastGet5d)(gradOutput, n, c, d, h, w); - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); - SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); - SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); - SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); - SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); - SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); - SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); - SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); - - // calculate gradGrid - real tnw_val = SAFE_GET(input, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW); - real tne_val = SAFE_GET(input, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW); - 
real tsw_val = SAFE_GET(input, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW); - real tse_val = SAFE_GET(input, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW); - real bnw_val = SAFE_GET(input, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW); - real bne_val = SAFE_GET(input, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW); - real bsw_val = SAFE_GET(input, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW); - real bse_val = SAFE_GET(input, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW); - - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; - - - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; - - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; - - } - - // un-normalize gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - giz = giz * (ID - 1) / 2; - - real gix_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 0); - real giy_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 1); - real giz_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 2); - - THTensor_(fastSet5d)(gradGrid, n, d, h, w, 0, gix_old + gix); - THTensor_(fastSet5d)(gradGrid, n, d, h, w, 1, giy_old + giy); - THTensor_(fastSet5d)(gradGrid, n, d, h, w, 2, giz_old + giz); - } - } - } - } -} - -#undef MIN -#undef MAX -#undef SAFE_GET -#undef CLIP_COORDINATES -#undef SAFE_ADD -#undef MODE_BORDER - -#endif diff --git a/aten/src/THNN/init.cpp b/aten/src/THNN/init.cpp index 6c79f5be295b60..c77cd76d54ec87 100644 --- a/aten/src/THNN/init.cpp +++ b/aten/src/THNN/init.cpp @@ -45,7 +45,7 @@ #define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -53,7 +53,7 @@ #define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \ if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] 
== %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -245,9 +245,6 @@ #include "generic/SpatialUpSamplingBilinear.c" #include "THGenerateFloatTypes.h" -#include "generic/SpatialGridSamplerBilinear.c" -#include "THGenerateFloatTypes.h" - #include "generic/VolumetricAveragePooling.c" #include "THGenerateFloatTypes.h" @@ -304,6 +301,3 @@ #include "generic/VolumetricUpSamplingTrilinear.c" #include "THGenerateFloatTypes.h" - -#include "generic/VolumetricGridSamplerBilinear.c" -#include "THGenerateFloatTypes.h" diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 0d84ccbfb606a1..588dae10e8e8e3 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -40,6 +40,7 @@ if(BUILD_ATEN) # ATen tests use catch instead of gtest so keep separate for now # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) @@ -51,6 +52,15 @@ if(BUILD_ATEN) set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) +else() + # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen. + # This codepath should only be exercised by the Android build. + add_subdirectory(../aten/src/ATen/core ATen_core) + list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) + list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) + # TODO: We should probably install the headers, but I don't know + # how to do that. endif() # ---[ Torch build @@ -215,6 +225,72 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") aten_set_target_props(caffe2) target_compile_options(caffe2 INTERFACE "-std=c++11") target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") +if (MSVC AND NOT BUILD_SHARED_LIBS) + # Note [Supporting both static and dynamic libraries on Window] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # A Windows library may be distributed as either a static or dynamic + # library. The chosen distribution mechanism affects how you setup + # the headers for the library: if you statically link a function, + # all you need is an ordinary signature: + # + # void f(); + # + # But if you *dynamically* link it, then you must provide a __declspec + # specifying that it should be imported from a DLL: + # + # __declspec(dllimport) void f(); + # + # Mixing the two situations will not work: if you specify dllimport + # while statically linking, the linker will complain it cannot find + # the __imp_f symbol (which serve as the DLL entrypoint); if you + # fail to specify dllimport for a symbol that's coming from a DLL, + # the linker will complain that it can't find f. Joy! + # + # Most places on the Internet, you will find people have written + # their headers under the assumption that the application will + # only ever be dynamically linked, as they define a macro which + # tags a function as __declspec(dllexport) if you are actually + # building the library, and __declspec(dllimport) otherwise. But + # if you want these headers to also work if you are linking against + # a static library, you need a way to avoid adding these __declspec's + # at all. 
And that "mechanism" needs to apply to any downstream + # libraries/executables which are going to link against your library. + # + # As an aside, why do we need to support both modes? + # For historical reasons, PyTorch ATen on Windows is built dynamically, + # while Caffe2 on Windows is built statically (mostly because if + # we build it dynamically, we are over the DLL exported symbol limit--and + # that is because Caffe2 hasn't comprehensively annotated all symbols + # which cross the DLL boundary with CAFFE_API). So any code + # which is used by both PyTorch and Caffe2 needs to support both + # modes of linking. + # + # So, you have a macro (call it AT_CORE_STATIC_WINDOWS) which you need to have + # set for any downstream library/executable that transitively includes your + # headers. How are you going to do this? You have two options: + # + # 1. Write out a config.h header which stores whether or not + # you are linking statically or dynamically. + # + # 2. Force all of users to set the the macro themselves. If they + # use cmake, you can set -DAT_CORE_STATIC_WINDOWS=1 as a PUBLIC + # compile option, in which case cmake will automatically + # add the macro for you. + # + # Which one is better? Well, it depends: they trade off implementor + # ease versus user ease: (1) is more work for the library author + # but the user doesn't have to worry about it; (2) requires the user + # to set the macro themselves... but only if they don't use cmake. + # + # So, which is appropriate in our situation? In my mind, here is + # the distinguishing factor: it is more common to distribute + # DLLs, since they don't require you to line up the CRT version + # (/MD, /MDd, /MT, /MTd) and MSVC version at the use site. So, + # if a user is already in the business of static linkage, they're + # already in "expert user" realm. So, I've decided that at this + # point in time, the simplicity of implementation of (2) wins out. 
+ target_compile_options(caffe2 PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") +endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index bc93f4866ebc28..df3ee5326b7d90 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -10,7 +10,6 @@ at::Backend ATenOp::backend() const { } OPERATOR_SCHEMA(ATen); -CAFFE_KNOWN_TYPE(at::Half); namespace math { template <> diff --git a/caffe2/core/context.h b/caffe2/core/context.h index f2831909e1587a..fc3969879f30c4 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,6 +13,9 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2.pb.h" +#include "ATen/core/ATenCoreTest.h" +#include "ATen/core/ArrayRef.h" + CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); namespace caffe2 { diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index a6e44846e9e0be..8924a9dc931be9 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -6,6 +6,11 @@ namespace caffe2 { +TEST(CPUContextTest, ATenCoreTest) { + int i = at::CoreTest(); + EXPECT_EQ(i + 1, at::CoreTest()); +} + TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); diff --git a/caffe2/core/dispatch/DeviceId.h b/caffe2/core/dispatch/DeviceId.h index e74a803557ea0d..e5744ce1e1c2d6 100644 --- a/caffe2/core/dispatch/DeviceId.h +++ b/caffe2/core/dispatch/DeviceId.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include "caffe2/utils/C++17.h" namespace c10 { diff --git a/caffe2/core/dispatch/LayoutId.h b/caffe2/core/dispatch/LayoutId.h index 7f039fadfa9698..9ec44519b95a99 100644 --- a/caffe2/core/dispatch/LayoutId.h +++ b/caffe2/core/dispatch/LayoutId.h @@ -1,10 +1,10 @@ #pragma once -#include "caffe2/utils/IdWrapper.h" +#include "ATen/core/IdWrapper.h" namespace c10 { -class LayoutId final : public c10::guts::IdWrapper { +class LayoutId final : public at::IdWrapper { public: constexpr explicit LayoutId(underlying_type id): IdWrapper(id) {} @@ -19,4 +19,4 @@ class LayoutId final : public c10::guts::IdWrapper { } -C10_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) +AT_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) diff --git a/caffe2/core/dispatch/TensorTypeId.h b/caffe2/core/dispatch/TensorTypeId.h index a80fc8377c8ca5..244817904667b9 100644 --- a/caffe2/core/dispatch/TensorTypeId.h +++ b/caffe2/core/dispatch/TensorTypeId.h @@ -1,6 +1,6 @@ #pragma once -#include "caffe2/utils/IdWrapper.h" +#include "ATen/core/IdWrapper.h" #include #include #include @@ -21,7 +21,7 @@ namespace details { /** * Dynamic type ID of a Tensor argument. It represents something like CPUTensor, etc. */ -class TensorTypeId final : public guts::IdWrapper { +class TensorTypeId final : public at::IdWrapper { public: // Don't use this! 
// Unfortunately, a default constructor needs to be defined because of https://reviews.llvm.org/D41223 @@ -35,4 +35,4 @@ class TensorTypeId final : public guts::IdWrapper namespace c10 { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h index 1e8156abe42172..70490856b5ecaf 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h @@ -659,336 +659,3 @@ class NHWC2NCHW : public NeuralNetOperator { private: }; - -class Int8Quantize : public NeuralNetOperator { - public: - Int8Quantize() : NeuralNetOperator(NNKind::Int8Quantize) {} - - ~Int8Quantize() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Quantize); - - private: -}; - -class Int8Dequantize : public NeuralNetOperator { - public: - Int8Dequantize() : NeuralNetOperator(NNKind::Int8Dequantize) {} - - ~Int8Dequantize() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Dequantize); - - private: -}; - -class Int8AveragePool : public NeuralNetOperator { - public: - Int8AveragePool() : NeuralNetOperator(NNKind::Int8AveragePool) {} - - Int8AveragePool(const AveragePool& averagePool) - : NeuralNetOperator(NNKind::Int8AveragePool) {} - - ~Int8AveragePool() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePool); - - private: -}; - -class Int8Conv : public NeuralNetOperator { - public: - Int8Conv() : NeuralNetOperator(NNKind::Int8Conv) {} - - Int8Conv(const Conv& conv) : NeuralNetOperator(NNKind::Int8Conv) {} - - ~Int8Conv() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Conv); - - private: -}; - -class Int8ConvTranspose : public NeuralNetOperator { - public: - Int8ConvTranspose() : NeuralNetOperator(NNKind::Int8ConvTranspose) {} - - Int8ConvTranspose(const ConvTranspose& convTranspose) - : NeuralNetOperator(NNKind::Int8ConvTranspose) {} - - ~Int8ConvTranspose() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvTranspose); - - private: -}; - -class Int8FC : public NeuralNetOperator { - public: - Int8FC() : NeuralNetOperator(NNKind::Int8FC) {} - - Int8FC(const FC& fC) : NeuralNetOperator(NNKind::Int8FC) {} - - ~Int8FC() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8FC); - - private: -}; - -class Int8MaxPool : public NeuralNetOperator { - public: - Int8MaxPool() : NeuralNetOperator(NNKind::Int8MaxPool) {} - - Int8MaxPool(const MaxPool& maxPool) - : NeuralNetOperator(NNKind::Int8MaxPool) {} - - ~Int8MaxPool() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPool); - - private: -}; - -class Int8Relu : public NeuralNetOperator { - public: - Int8Relu() : NeuralNetOperator(NNKind::Int8Relu) {} - - Int8Relu(const Relu& relu) : NeuralNetOperator(NNKind::Int8Relu) {} - - ~Int8Relu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Relu); - - private: -}; - -class Int8GivenTensorFill : public NeuralNetOperator { - public: - Int8GivenTensorFill() : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} - - Int8GivenTensorFill(const GivenTensorFill& givenTensorFill) - : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} - - ~Int8GivenTensorFill() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8GivenTensorFill); - - private: -}; - -class Int8Concat : public NeuralNetOperator { - public: - Int8Concat() : NeuralNetOperator(NNKind::Int8Concat) {} - - Int8Concat(const Concat& concat) : NeuralNetOperator(NNKind::Int8Concat) {} - - ~Int8Concat() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Concat); - - private: -}; - -class Int8Softmax : public NeuralNetOperator { - public: - Int8Softmax() : NeuralNetOperator(NNKind::Int8Softmax) {} - - Int8Softmax(const Softmax& softmax) - : 
NeuralNetOperator(NNKind::Int8Softmax) {} - - ~Int8Softmax() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Softmax); - - private: -}; - -class Int8ChannelShuffle : public NeuralNetOperator { - public: - Int8ChannelShuffle() : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} - - Int8ChannelShuffle(const ChannelShuffle& channelShuffle) - : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} - - ~Int8ChannelShuffle() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8ChannelShuffle); - - private: -}; - -class Int8Sum : public NeuralNetOperator { - public: - Int8Sum() : NeuralNetOperator(NNKind::Int8Sum) {} - - Int8Sum(const Sum& sum) : NeuralNetOperator(NNKind::Int8Sum) {} - - ~Int8Sum() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Sum); - - private: -}; - -class Int8Add : public NeuralNetOperator { - public: - Int8Add() : NeuralNetOperator(NNKind::Int8Add) {} - - Int8Add(const Add& add) : NeuralNetOperator(NNKind::Int8Add) {} - - ~Int8Add() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Add); - - private: -}; - -class Int8Reshape : public NeuralNetOperator { - public: - Int8Reshape() : NeuralNetOperator(NNKind::Int8Reshape) {} - - Int8Reshape(const Reshape& reshape) - : NeuralNetOperator(NNKind::Int8Reshape) {} - - ~Int8Reshape() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Reshape); - - private: -}; - -class Int8Flatten : public NeuralNetOperator { - public: - Int8Flatten() : NeuralNetOperator(NNKind::Int8Flatten) {} - - Int8Flatten(const Flatten& flatten) - : NeuralNetOperator(NNKind::Int8Flatten) {} - - ~Int8Flatten() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Flatten); - - private: -}; - -class Int8ConvRelu : public NeuralNetOperator { - public: - Int8ConvRelu() : NeuralNetOperator(NNKind::Int8ConvRelu) {} - - Int8ConvRelu(const ConvRelu& convRelu) - : NeuralNetOperator(NNKind::Int8ConvRelu) {} - - ~Int8ConvRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvRelu); - - private: -}; - -class Int8SumRelu : public NeuralNetOperator { - public: - Int8SumRelu() : NeuralNetOperator(NNKind::Int8SumRelu) {} - - Int8SumRelu(const SumRelu& sumRelu) - : NeuralNetOperator(NNKind::Int8SumRelu) {} - - ~Int8SumRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8SumRelu); - - private: -}; - -class Int8AveragePoolRelu : public NeuralNetOperator { - public: - Int8AveragePoolRelu() : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} - - Int8AveragePoolRelu(const AveragePoolRelu& averagePoolRelu) - : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} - - ~Int8AveragePoolRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePoolRelu); - - private: -}; - -class Int8MaxPoolRelu : public NeuralNetOperator { - public: - Int8MaxPoolRelu() : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} - - Int8MaxPoolRelu(const MaxPoolRelu& maxPoolRelu) - : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} - - ~Int8MaxPoolRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPoolRelu); - - private: -}; - -class BatchMatMul : public NeuralNetOperator { - public: - BatchMatMul(bool transA = false, bool transB = true, bool broadcast = false) - : NeuralNetOperator(NNKind::BatchMatMul), - TransA(transA), - TransB(transB), - Broadcast(broadcast) {} - - ~BatchMatMul() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(BatchMatMul); - - bool getTransA() const { - return TransA; - } - - bool getTransB() const { - return TransB; - } - - bool getBroadcast() const { - return Broadcast; - } - - void setTransA(bool transA) { - TransA = transA; - } - - void setTransB(bool transB) { - TransB = transB; - } - - void setBroadcast(bool broadcast) { - Broadcast = broadcast; - } - - private: - bool TransA; - bool TransB; - bool Broadcast; -}; - -class 
BatchGather : public NeuralNetOperator { - public: - BatchGather() : NeuralNetOperator(NNKind::BatchGather) {} - - ~BatchGather() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(BatchGather); - - private: -}; - -class ConcatBatchMatMulBatchGatherOp : public NeuralNetOperator { - public: - ConcatBatchMatMulBatchGatherOp() - : NeuralNetOperator(NNKind::ConcatBatchMatMulBatchGatherOp) {} - - ~ConcatBatchMatMulBatchGatherOp() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(ConcatBatchMatMulBatchGatherOp); - - private: -}; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h index 9c4277293d0b41..4d15dd40613403 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h @@ -1,9 +1,4 @@ Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool, MaxPoolRelu, Sum, SumRelu, Send, Receive, BatchNormalization, FC, GivenTensorFill, Concat, Softmax, ChannelShuffle, Add, Reshape, Flatten, - NCHW2NHWC, NHWC2NCHW, Int8Quantize, Int8Dequantize, Int8AveragePool, - Int8Conv, Int8ConvTranspose, Int8FC, Int8MaxPool, Int8Relu, - Int8GivenTensorFill, Int8Concat, Int8Softmax, Int8ChannelShuffle, Int8Sum, - Int8Add, Int8Reshape, Int8Flatten, Int8ConvRelu, Int8SumRelu, - Int8AveragePoolRelu, Int8MaxPoolRelu, BatchMatMul, BatchGather, - ConcatBatchMatMulBatchGatherOp + NCHW2NHWC, NHWC2NCHW diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h index 87ffda3c4f3436..88ffa0b1ba6bb0 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h @@ -1,92 +1,68 @@ case NNKind::Relu: return "Relu"; + case NNKind::Conv: return "Conv"; + case NNKind::ConvRelu: return "ConvRelu"; + case NNKind::ConvTranspose: return "ConvTranspose"; + case NNKind::AveragePool: return "AveragePool"; + case NNKind::AveragePoolRelu: return "AveragePoolRelu"; + case NNKind::MaxPool: return "MaxPool"; + case NNKind::MaxPoolRelu: return "MaxPoolRelu"; + case NNKind::Sum: return "Sum"; + case NNKind::SumRelu: return "SumRelu"; + case NNKind::Send: return "Send"; + case NNKind::Receive: return "Receive"; + case NNKind::BatchNormalization: return "BatchNormalization"; + case NNKind::FC: return "FC"; + case NNKind::GivenTensorFill: return "GivenTensorFill"; + case NNKind::Concat: return "Concat"; + case NNKind::Softmax: return "Softmax"; + case NNKind::ChannelShuffle: return "ChannelShuffle"; + case NNKind::Add: return "Add"; + case NNKind::Reshape: return "Reshape"; + case NNKind::Flatten: return "Flatten"; + case NNKind::NCHW2NHWC: return "NCHW2NHWC"; + case NNKind::NHWC2NCHW: return "NHWC2NCHW"; -case NNKind::Int8Quantize: - return "Int8Quantize"; -case NNKind::Int8Dequantize: - return "Int8Dequantize"; -case NNKind::Int8AveragePool: - return "Int8AveragePool"; -case NNKind::Int8Conv: - return "Int8Conv"; -case NNKind::Int8ConvTranspose: - return "Int8ConvTranspose"; -case NNKind::Int8FC: - return "Int8FC"; -case NNKind::Int8MaxPool: - return "Int8MaxPool"; -case NNKind::Int8Relu: - return "Int8Relu"; -case NNKind::Int8GivenTensorFill: - return "Int8GivenTensorFill"; -case NNKind::Int8Concat: - return "Int8Concat"; -case NNKind::Int8Softmax: - return "Int8Softmax"; -case NNKind::Int8ChannelShuffle: - return "Int8ChannelShuffle"; -case NNKind::Int8Sum: - return "Int8Sum"; -case NNKind::Int8Add: - return "Int8Add"; -case 
NNKind::Int8Reshape: - return "Int8Reshape"; -case NNKind::Int8Flatten: - return "Int8Flatten"; -case NNKind::Int8ConvRelu: - return "Int8ConvRelu"; -case NNKind::Int8SumRelu: - return "Int8SumRelu"; -case NNKind::Int8AveragePoolRelu: - return "Int8AveragePoolRelu"; -case NNKind::Int8MaxPoolRelu: - return "Int8MaxPoolRelu"; -case NNKind::BatchMatMul: - return "BatchMatMul"; -case NNKind::BatchGather: - return "BatchGather"; -case NNKind::ConcatBatchMatMulBatchGatherOp: - return "ConcatBatchMatMulBatchGatherOp"; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 3c5148e5b6c70f..aab127d8c56e16 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -46,28 +46,31 @@ class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) - : StorageType(std::forward(args)...), Tail(tail), Head(head) { + : StorageType(std::forward(args)...), + tail_(tail), + head_(head) { DEBUG_PRINT("Creating instance of Edge: %p\n", this); } const NodeRef& tail() const { - return Tail; + return tail_; } const NodeRef& head() const { - return Head; + return head_; } void setTail(NodeRef n) { - Tail = n; + tail_ = n; } void setHead(NodeRef n) { - Head = n; + head_ = n; } private: - NodeRef Tail; - NodeRef Head; + NodeRef tail_; + NodeRef head_; + friend class Graph; }; @@ -88,54 +91,55 @@ class Node : public StorageType, public Notifier> { /// \brief Adds an edge by reference to known in-edges. /// \p e A reference to an edge that will be added as an in-edge. void addInEdge(EdgeRef e) { - inEdges.emplace_back(e); + inEdges_.emplace_back(e); } /// \brief Adds an edge by reference to known out-edges. /// \p e A reference to an edge that will be added as an out-edge. void addOutEdge(EdgeRef e) { - outEdges.emplace_back(e); + outEdges_.emplace_back(e); } /// \brief Removes an edge by reference to known in-edges. /// \p e A reference to an edge that will be removed from in-edges. void removeInEdge(EdgeRef e) { - auto iter = std::find(inEdges.begin(), inEdges.end(), e); - assert( - iter != inEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - inEdges.erase(iter); + removeEdgeInternal(inEdges_, e); } /// \brief Removes an edge by reference to known out-edges. /// \p e A reference to an edge that will be removed from out-edges. 
void removeOutEdge(EdgeRef e) { - auto iter = std::find(outEdges.begin(), outEdges.end(), e); - assert( - iter != outEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - outEdges.erase(iter); + removeEdgeInternal(outEdges_, e); } const std::vector& getOutEdges() const { - return outEdges; + return outEdges_; } const std::vector& getInEdges() const { - return inEdges; + return inEdges_; } - void setInEdges(std::vector es) { - inEdges = es; + void setInEdges(std::vector edges) { + inEdges_ = edges; } - void setOutEdges(std::vector es) { - outEdges = es; + void setOutEdges(std::vector edges) { + outEdges_ = edges; } - protected: - std::vector inEdges; - std::vector outEdges; + private: + std::vector inEdges_; + std::vector outEdges_; + friend class Graph; + + void removeEdgeInternal(std::vector& edges, EdgeRef e) { + auto iter = std::find(edges.begin(), edges.end(), e); + assert( + iter != edges.end() && + "Attempted to remove edge that isn't connected to this node"); + edges.erase(iter); + } }; /// \brief Effectively a constant reference to a graph. @@ -158,46 +162,56 @@ class Subgraph { using EdgeRef = typename Graph::EdgeRef; void addNode(NodeRef n) { - Nodes.insert(n); + nodes_.insert(n); } + bool hasNode(NodeRef n) const { - return Nodes.count(n) != 0; + return nodes_.count(n) != 0; } + void removeNode(NodeRef n) { - Nodes.erase(n); + nodes_.erase(n); } void addEdge(EdgeRef e) { - Edges.insert(e); + edges_.insert(e); } - bool hasEdge(EdgeRef n) const { - return Edges.count(n) != 0; + + bool hasEdge(EdgeRef e) const { + return edges_.count(e) != 0; } + void removeEdge(EdgeRef e) { - Edges.erase(e); + edges_.erase(e); } const std::unordered_set& getNodes() const { - return Nodes; + return nodes_; + } + + const size_t getNodesCount() const { + return (size_t)nodes_.size(); } + const std::unordered_set& getEdges() const { - return Edges; + return edges_; } + private: + std::unordered_set nodes_; + std::unordered_set edges_; + void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge->tail(), edge->head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", node); } } - - std::unordered_set Nodes; - std::unordered_set Edges; }; /// \brief A simple graph implementation @@ -231,21 +245,21 @@ class Graph { } void importNode(NodeRef node, Graph& otherGraph) { - for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { if (&(*it) == node) { - std::list>& otherNodes = otherGraph.Nodes; - otherNodes.splice(otherNodes.end(), Nodes, it, ++it); - otherGraph.NodeRefs.insert(node); + std::list>& otherNodes = otherGraph.nodes_; + otherNodes.splice(otherNodes.end(), nodes_, it, ++it); + otherGraph.nodeRefs_.insert(node); break; } } } void importEdge(EdgeRef edge, Graph& otherGraph) { - std::list>& otherEdges = otherGraph.Edges; - for (auto it = Edges.begin(); it != Edges.end(); ++it) { + std::list>& otherEdges = otherGraph.edges_; + for (auto it = edges_.begin(); it != edges_.end(); ++it) { if (&(*it) == edge) { - otherEdges.splice(otherEdges.end(), Edges, it, ++it); + otherEdges.splice(otherEdges.end(), edges_, it, ++it); break; } } @@ -313,9 +327,9 @@ class Graph { /// \return A reference to the edge created. EdgeRef createEdge(NodeRef tail, NodeRef head, U... 
data) { DEBUG_PRINT("Creating edge (%p -> %p)\n", tail, head); - this->Edges.emplace_back( + this->edges_.emplace_back( Edge(tail, head, std::forward(data)...)); - EdgeRef e = &this->Edges.back(); + EdgeRef e = &this->edges_.back(); head->addInEdge(e); tail->addOutEdge(e); return e; @@ -339,85 +353,85 @@ class Graph { /// related to the node. void deleteNode(NodeRef n, bool deleteEdges = true) { if (deleteEdges) { - auto inEdges = n->inEdges; + auto inEdges = n->inEdges_; for (auto& edge : inEdges) { deleteEdge(edge); } - auto outEdges = n->outEdges; + auto outEdges = n->outEdges_; for (auto& edge : outEdges) { deleteEdge(edge); } } - for (auto i = Nodes.begin(); i != Nodes.end(); ++i) { + for (auto i = nodes_.begin(); i != nodes_.end(); ++i) { if (&*i == n) { - NodeRefs.erase(n); - Nodes.erase(i); + nodeRefs_.erase(n); + nodes_.erase(i); break; } } } - bool hasNode(NodeRef ref) const { - return NodeRefs.find(ref) != NodeRefs.end(); + bool hasNode(NodeRef node) const { + return nodeRefs_.find(node) != nodeRefs_.end(); } /// \brief Deletes a edge from the graph. /// \p e A reference to the edge. - void deleteEdge(EdgeRef e, bool remove_ref = true) { - if (remove_ref) { - e->Tail->removeOutEdge(e); - e->Head->removeInEdge(e); + void deleteEdge(EdgeRef e, bool removeRef = true) { + if (removeRef) { + e->tail_->removeOutEdge(e); + e->head_->removeInEdge(e); } - for (auto i = Edges.begin(); i != Edges.end(); ++i) { + for (auto i = edges_.begin(); i != edges_.end(); ++i) { if (&*i == e) { - Edges.erase(i); + edges_.erase(i); break; } } } const std::vector getMutableNodes() { - std::vector v; - for (auto& n : Nodes) { + std::vector result; + for (auto& n : nodes_) { DEBUG_PRINT("Adding node to mutable output (%p)\n", &n); - v.emplace_back(&n); + result.emplace_back(&n); } - return v; + return result; } const std::vector getMutableEdges() { - std::vector v; - for (auto& e : Edges) { + std::vector result; + for (auto& e : edges_) { DEBUG_PRINT("Adding edge to mutable output (%p)\n", &e); - v.emplace_back(&e); + result.emplace_back(&e); } - return v; + return result; + } + + private: + std::list> nodes_; + std::list> edges_; + std::unordered_set nodeRefs_; + + NodeRef createNodeInternal(Node&& node) { + nodes_.emplace_back(std::move(node)); + NodeRef nodeRef = &nodes_.back(); + DEBUG_PRINT("Creating node (%p)\n", nodeRef); + nodeRefs_.insert(nodeRef); + return nodeRef; } void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge.tail(), edge.head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", &node); } } - - private: - std::list> Nodes; - std::list> Edges; - std::unordered_set NodeRefs; - - NodeRef createNodeInternal(Node&& node) { - Nodes.emplace_back(std::move(node)); - NodeRef nodeRef = &Nodes.back(); - DEBUG_PRINT("Creating node (%p)\n", nodeRef); - NodeRefs.insert(nodeRef); - return nodeRef; - } }; } // namespace nom diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h new file mode 100644 index 00000000000000..08ead742950740 --- /dev/null +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -0,0 +1,174 @@ +#ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H +#define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H + +namespace nom { + +namespace matcher { + +/* + * Subtree matching criteria consists of + * - Node 
matching criteria for the subtree's root. + * - Children subtree matching criteria + * - A count, which means we may want more than one of this subtree. The count + * can be unlimited. The count is only used when we match children of a + * subtree root, not matching the subtree itself. + */ +template +class SubtreeMatchCriteria { + public: + static const int kStarCount = -1; + SubtreeMatchCriteria( + const NodeMatchCriteria& root, + const std::vector& children, + int count) + : root_(root), children_(children), count_(count){}; + + private: + NodeMatchCriteria root_; + std::vector children_; + int count_; + + template + friend class SubgraphMatcher; +}; + +/* + * Utilities for subgraph matching. + */ +template < + typename GraphType, + typename NodeMatchCriteria, + typename NodeMatcherClass> +struct SubgraphMatcher { + static bool isNodeMatch( + typename GraphType::NodeRef node, + const NodeMatchCriteria& criteria) { + return NodeMatcherClass::isMatch(node, criteria); + } + + // Check if there can be a sub-tree that matches the given criteria that + // is rooted at the given rootNode. + // The flag invertGraphTraversal specify if we should follow out edges or + // in edges. The default is true which is useful for a functional + // intepretation of a dataflow graph. + static bool isSubtreeMatch( + typename GraphType::NodeRef root, + const SubtreeMatchCriteria& criteria, + bool invertGraphTraversal = true) { + if (!isNodeMatch(root, criteria.root_)) { + return false; + } + auto& edges = + invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); + + int numEdges = edges.size(); + int numChildrenCriteria = criteria.children_.size(); + + // The current algorithm implies that the ordering of the children is + // important. The children nodes will be matched with the children subtree + // criteria in the given order. + + int currentEdgeIdx = 0; + for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; + criteriaIdx++) { + auto childrenCriteria = criteria.children_[criteriaIdx]; + + int expectedCount = childrenCriteria.count_; + bool isStarCount = + expectedCount == SubtreeMatchCriteria::kStarCount; + + int countMatch = 0; + + // Continue to match subsequent edges with the current children criteria. + // Note that if the child criteria is a * pattern, this greedy algorithm + // will attempt to find the longest possible sequence that matches the + // children criteria. + for (; currentEdgeIdx < numEdges && + (isStarCount || countMatch < expectedCount); + currentEdgeIdx++) { + auto edge = edges[currentEdgeIdx]; + auto nextNode = invertGraphTraversal ? edge->tail() : edge->head(); + + if (!isSubtreeMatch(nextNode, childrenCriteria, invertGraphTraversal)) { + if (!isStarCount) { + // If the current criteria isn't a * pattern, this indicates a + // failure. + return false; + } else { + // Otherwise, we should move on to the next children criteria. + break; + } + } + + countMatch++; + } + + if (countMatch < expectedCount) { + // Fails because there are not enough matches as specified by the + // criteria. + return false; + } + } + + if (currentEdgeIdx < numEdges) { + // Fails because there are unmatched edges. + return false; + } + return true; + } + + // Utility to transform a graph by looking for subtrees that match + // a given pattern and then allow callers to mutate the graph based on + // subtrees that are found. + // The current implementation doesn't handle any graph transformation + // itself. 
Callers should be responsible for all intended mutation, including + // deleting nodes in the subtrees found by this algorithm. + // Note: if the replaceFunction lambda returns false, the entire procedure + // is aborted. This maybe useful in certain cases when we want to terminate + // the subtree search early. + // invertGraphTraversal flag: see documentation in isSubtreeMatch + static void replaceSubtree( + GraphType& graph, + const SubtreeMatchCriteria& criteria, + const std::function< + bool(GraphType& g, typename GraphType::NodeRef subtreeRoot)>& + replaceFunction, + bool invertGraphTraversal = true) { + for (auto nodeRef : graph.getMutableNodes()) { + // Make sure the node is still in the graph. + if (!graph.hasNode(nodeRef)) { + continue; + } + if (isSubtreeMatch(nodeRef, criteria, invertGraphTraversal)) { + if (!replaceFunction(graph, nodeRef)) { + // If replaceFunction returns false, it means that we should abort + // the entire procedure. + break; + } + } + } + } +}; + +// Convenient methods to create subtree matching criteria. +template +SubtreeMatchCriteria tree( + const NodeMatchCriteria& root, + const std::vector>& children = {}, + int count = 1) { + return SubtreeMatchCriteria(root, children, count); +} + +template +SubtreeMatchCriteria treeStar( + const NodeMatchCriteria& root, + const std::vector>& children = {}) { + return tree( + root, children, SubtreeMatchCriteria::kStarCount); +} + +} // namespace matcher + +} // namespace nom + +#endif // NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index c62148ea52cff5..2d1125f5762ad4 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -6,6 +6,8 @@ from __future__ import unicode_literals import argparse +from textwrap import dedent +from subprocess import call def parse_lines(lines): @@ -22,25 +24,27 @@ def parse_lines(lines): index = 0 while index < len(lines): line = lines[index] - if line.lower().startswith('macro'): - assert (parse_state == EMPTY) - macro_line = line.split(' ') + if line.lower().startswith("macro"): + assert parse_state == EMPTY + macro_line = line.split(" ") # Support macros that look like attributes # e.g. 
macro - CONV_LIKE - curr_macro = ' '.join(macro_line[1:]) - assert (curr_macro not in macros) + curr_macro = " ".join(macro_line[1:]) + assert curr_macro not in macros, 'Macro "{}" defined twice.'.format( + curr_macro + ) macros[curr_macro] = [] parse_state = MACRO - lines = lines[:index] + lines[index + 1:] + lines = lines[:index] + lines[index + 1 :] continue - elif line.lower().startswith('endmacro'): - assert (parse_state == MACRO) + elif line.lower().startswith("endmacro"): + assert parse_state == MACRO parse_state = EMPTY - lines = lines[:index] + lines[index + 1:] + lines = lines[:index] + lines[index + 1 :] continue elif parse_state == MACRO: macros[curr_macro].append(line) - lines = lines[:index] + lines[index + 1:] + lines = lines[:index] + lines[index + 1 :] continue index += 1 @@ -48,7 +52,7 @@ def parse_lines(lines): while index < len(lines): line = lines[index] if line in macros: - lines = lines[:index] + macros[line] + lines[index + 1:] + lines = lines[:index] + macros[line] + lines[index + 1 :] index += len(macros[line]) - 1 index += 1 @@ -63,20 +67,20 @@ def parse_lines(lines): for line in lines: if not len(line): continue - if line[0] == '-': - assert (parse_state is OP) - attr = [_.strip() for _ in line[1:].split(':')] - assert (attr[0][0].isupper()) - if (len(attr) == 2): # attribute : type + if line[0] == "-": + assert parse_state is OP + attr = [_.strip() for _ in line[1:].split(":")] + assert attr[0][0].isupper() + if len(attr) == 2: # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1])) - elif (len(attr) == 3): # attribute : type + elif len(attr) == 3: # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1], attr[2])) else: - op = [l.strip() for l in line.split(':')] - assert (len(op[0].split(' ')) == 1) + op = [l.strip() for l in line.split(":")] + assert len(op[0].split(" ")) == 1 parse_state = OP curr_op = op[0] - assert (curr_op not in ops) + assert curr_op not in ops ops[curr_op] = {} op_list.append(curr_op) if len(op) > 1: @@ -101,20 +105,26 @@ def gen_class(op, op_def): attr_arg = "{type} {lower_name}".format( type=t, lower_name=lower_name + default_arg ) - attr_init = "{name}({lower_name})".format( - name=name, lower_name=lower_name - ) + attr_init = "{name}({lower_name})".format(name=name, lower_name=lower_name) attr_declare = "{type} {name};".format(type=t, name=name) - attr_get = """ - {type} get{name}() const {{ - return {name}; - }} -""".format(type=t, name=name) - attr_set = """ - void set{name}({type} {lower_name}) {{ - {name} = {lower_name}; - }} -""".format(type=t, name=name, lower_name=lower_name) + attr_get = dedent( + """ + {type} get{name}() const {{ + return {name}; + }} + """.format( + type=t, name=name + ) + ) + attr_set = dedent( + """ + void set{name}({type} {lower_name}) {{ + {name} = {lower_name}; + }} + """.format( + type=t, name=name, lower_name=lower_name + ) + ) attribute_args.append(attr_arg) attribute_init.append(attr_init) attribute_declarations.append(attr_declare) @@ -132,38 +142,43 @@ def gen_class(op, op_def): name=attr[0], other_op=lower_other_op ) ) - init = """ - {op}(const {other_op}& {lower_other_op}) : - {other_init} {{}} -""".format( - op=op, - other_op=other_op, - lower_other_op=lower_other_op, - other_init=',\n '.join(other_init) + init = dedent( + """ + {op}(const {other_op}& {lower_other_op}) : + {other_init} {{}} + """.format( + op=op, + other_op=other_op, + lower_other_op=lower_other_op, + other_init=",\n ".join(other_init), + ) ) extra_init += init - return """class {op} : public 
NeuralNetOperator {{ - public: - {op}({attribute_args}) : - {attribute_init} {{}} - {extra_init} - ~{op}() {{}} - - NOMNIGRAPH_DEFINE_NN_RTTI({op}); -{getters}{setters} - private: - {attribute_declarations} -}}; - -""".format( - op=op, - extra_init=extra_init, - getters=''.join(attribute_getters), - setters=''.join(attribute_setters), - attribute_args=',\n '.join(attribute_args), - attribute_init=',\n '.join(attribute_init), - attribute_declarations='\n '.join(attribute_declarations) + return dedent( + """ + class {op} : public NeuralNetOperator {{ + public: + {op}({attribute_args}) : + {attribute_init} {{}} + {extra_init} + ~{op}() {{}} + + NOMNIGRAPH_DEFINE_NN_RTTI({op}); + {getters}{setters} + private: + {attribute_declarations} + }}; + + """.format( + op=op, + extra_init=extra_init, + getters="".join(attribute_getters), + setters="".join(attribute_setters), + attribute_args=",\n".join(attribute_args), + attribute_init=",\n".join(attribute_init), + attribute_declarations="\n".join(attribute_declarations), + ) ) @@ -175,33 +190,51 @@ def gen_classes(ops, op_list): def gen_enum(op_list): - return ',\n'.join([op for op in op_list]) + '\n' + return ",\n".join([op for op in op_list]) + "\n" def gen_names(op_list): f = "" for op in op_list: - f += """case NNKind::{name}: - return \"{name}\"; -""".format(name=op) + f += dedent( + """ + case NNKind::{name}: + return \"{name}\"; + """.format( + name=op + ) + ) return f if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Generate op files.') - parser.add_argument('--install_dir', help='installation directory') - parser.add_argument('--source_def', help='ops.def') + parser = argparse.ArgumentParser(description="Generate op files.") + parser.add_argument("--install_dir", help="installation directory") + parser.add_argument("--source_def", help="ops.def", action="append") args = parser.parse_args() install_dir = args.install_dir + sources = args.source_def - with open(args.source_def, 'rb') as f: - lines = f.readlines() - lines = [l.strip().decode("utf-8") for l in lines] + lines = [] + for source in sources: + with open(source, "rb") as f: + lines_tmp = f.readlines() + lines += [l.strip().decode("utf-8") for l in lines_tmp] ops, op_list = parse_lines(lines) - with open(install_dir + '/OpClasses.h', 'wb') as f: + with open(install_dir + "/OpClasses.h", "wb") as f: f.write(gen_classes(ops, op_list).encode("utf-8")) - with open(install_dir + '/OpNames.h', 'wb') as f: + with open(install_dir + "/OpNames.h", "wb") as f: f.write(gen_names(op_list).encode("utf-8")) - with open(install_dir + '/OpEnum.h', 'wb') as f: + with open(install_dir + "/OpEnum.h", "wb") as f: f.write(gen_enum(op_list).encode("utf-8")) + + try: + cmd = ["clang-format", "-i", install_dir + "/OpClasses.h"] + call(cmd) + cmd = ["clang-format", "-i", install_dir + "/OpNames.h"] + call(cmd) + cmd = ["clang-format", "-i", install_dir + "/OpEnum.h"] + call(cmd) + except Exception: + pass diff --git a/caffe2/core/nomnigraph/ops.def b/caffe2/core/nomnigraph/ops.def index 53dd951c8fc1c2..6183e3c25726a3 100644 --- a/caffe2/core/nomnigraph/ops.def +++ b/caffe2/core/nomnigraph/ops.def @@ -69,30 +69,3 @@ CopyFromOpenCL NCHW2NHWC NHWC2NCHW -Int8Quantize -Int8Dequantize -Int8AveragePool : AveragePool -Int8Conv : Conv -Int8ConvTranspose : ConvTranspose -Int8FC : FC -Int8MaxPool : MaxPool -Int8Relu : Relu -Int8GivenTensorFill : GivenTensorFill -Int8Concat : Concat -Int8Softmax : Softmax -Int8ChannelShuffle : ChannelShuffle -Int8Sum : Sum -Int8Add : Add -Int8Reshape : Reshape 
-Int8Flatten : Flatten -Int8ConvRelu : ConvRelu -Int8SumRelu : SumRelu -Int8AveragePoolRelu : AveragePoolRelu -Int8MaxPoolRelu : MaxPoolRelu - -BatchMatMul -- TransA : bool : false -- TransB : bool : true -- Broadcast: bool : false -BatchGather -ConcatBatchMatMulBatchGatherOp diff --git a/caffe2/core/nomnigraph/tests/binary_match_test.cc b/caffe2/core/nomnigraph/tests/binary_match_test.cc index 4834cea30f3e23..ca3fd11b3a9126 100644 --- a/caffe2/core/nomnigraph/tests/binary_match_test.cc +++ b/caffe2/core/nomnigraph/tests/binary_match_test.cc @@ -19,7 +19,7 @@ TEST(BinaryMatch, AllMatch) { auto matches = nom::algorithm::binaryMatch( &graph, [](decltype(graph)::NodeRef n) { return true; }); EXPECT_EQ(matches.size(), 1); - EXPECT_EQ(matches.front().Nodes.size(), graph.getMutableNodes().size()); + EXPECT_EQ(matches.front().getNodesCount(), graph.getMutableNodes().size()); } TEST(BinaryMatch, EmptyGraph) { @@ -58,9 +58,9 @@ TEST(BinaryMatch, Basic) { EXPECT_EQ(matches.size(), 1); auto match = matches.front(); - EXPECT_EQ(match.Nodes.size(), 4); + EXPECT_EQ(match.getNodesCount(), 4); std::set exp{"2", "3", "4", "6"}; - for (auto n : match.Nodes) { + for (auto n : match.getNodes()) { EXPECT_EQ(exp.count(n->data()), 1); exp.erase(n->data()); } @@ -104,16 +104,16 @@ TEST(BinaryMatch, RemovedMiddleNode) { auto match1 = matches.front(); auto match2 = matches.back(); - EXPECT_EQ(match1.Nodes.size(), 2); - EXPECT_EQ(match2.Nodes.size(), 1); + EXPECT_EQ(match1.getNodesCount(), 2); + EXPECT_EQ(match2.getNodesCount(), 1); std::set exp1{"2", "4"}; std::set exp2{"6"}; - for (auto n : match1.Nodes) { + for (auto n : match1.getNodes()) { EXPECT_EQ(exp1.count(n->data()), 1); exp1.erase(n->data()); } - for (auto n : match2.Nodes) { + for (auto n : match2.getNodes()) { EXPECT_EQ(exp2.count(n->data()), 1); exp2.erase(n->data()); } diff --git a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc new file mode 100644 index 00000000000000..ddd8a15fcdc2bc --- /dev/null +++ b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc @@ -0,0 +1,404 @@ +#include + +#include "test_util.h" + +#include "nomnigraph/Transformations/SubgraphMatcher.h" + +#include + +namespace nom { + +namespace matcher { + +using NodeType = std::string; +using Criteria = std::string; + +// Node matches a criteria (string) if the data string is the same as the +// criteria. Special case: "*" will match any thing. +struct TestNodeMatch { + static bool isMatch( + const nom::Graph::NodeRef& node, + const Criteria& criteria) { + return criteria == "*" || criteria == node->data(); + } +}; + +using TestGraph = Graph; +using TestMatcher = SubgraphMatcher; + +Criteria any() { + return Criteria("*"); +} + +// Make it more concise to create matching criteria in dataflow graph. +// For example, operatorTree("opA", ...) will refer to a tree like this: +// ... -> opA -> opA_Output +SubtreeMatchCriteria operatorTree( + const Criteria& root, + const std::vector>& childrenCriteria = {}, + int count = 1) { + return tree(any(), {tree(root, childrenCriteria)}, count); +} + +std::map TestGraphNodePrinter( + TestGraph::NodeRef node) { + std::map labelMap; + labelMap["label"] = node->data(); + return labelMap; +}; + +// Attempts to create a realistic dataflow graph that shows a fuse procedure. 
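+// The criteria built below cover the chain opC -> opB -> opF -> opG (plus the
+// side input dataI); the ReplaceSubtreeRealistic test then collapses that
+// chain into a single "opFused" node.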
+struct DataFlowTestGraph { + const int numInputs = 4; + + TestGraph graph; + + TestGraph::NodeRef opB; + TestGraph::NodeRef opF; + TestGraph::NodeRef opC; + TestGraph::NodeRef opG; + TestGraph::NodeRef dataOut; + + // Realistic data flow test graph. + /* + + + +---------------+ + | | + | +---------+ | +---------+ + +---------------------+ | input_A | | | input_B | + | +---------+ | +---------+ + | | | | + | | | | + | v v v + +---------++---------+ +-------------------------+ +--------+ + | input_C || input_D | --> | opC | --> | dataC2 | + +---------++---------+ +-------------------------+ +--------+ + | + | + v + +---------+ + | dataC | -+ + +---------+ | + | | + | | + v | + +---------+ | + | opB | <+ + +---------+ + | + | + v + +---------+ + | dataB | + +---------+ + | + | + v + +---------+ + | opF | + +---------+ + | + | + v + +---------+ + | dataF | + +---------+ + | + | + v + +---------+ +---------+ + | dataI | --> | opG | + +---------+ +---------+ + | + | + v + +---------+ + | dataOut | + +---------+ + */ + DataFlowTestGraph() { + opC = graph.createNode("opC"); + + for (int i = 0; i < numInputs; i++) { + auto dataInput = graph.createNode("input"); + graph.createEdge(dataInput, opC); + } + + auto dataC = graph.createNode("dataC"); + auto dataC2 = graph.createNode("dataC2"); + graph.createEdge(opC, dataC); + graph.createEdge(opC, dataC2); + + opB = graph.createNode("opB"); + // There are 2 edges + graph.createEdge(dataC, opB); + graph.createEdge(dataC, opB); + + auto dataB = graph.createNode("dataB"); + graph.createEdge(opB, dataB); + + opF = graph.createNode("opF"); + graph.createEdge(dataB, opF); + + auto dataF = graph.createNode("dataF"); + graph.createEdge(opF, dataF); + + auto dataI = graph.createNode("dataI"); + + opG = graph.createNode("opG"); + graph.createEdge(dataF, opG); + graph.createEdge(dataI, opG); + + dataOut = graph.createNode("dataOut"); + graph.createEdge(opG, dataOut); + + // Use nom::converters::convertToDotString(&graph, TestGraphNodePrinter) + // to visualize the graph. + } +}; + +SubtreeMatchCriteria DataFlowTestGraphCriteria() { + // clang-format off + return tree( + Criteria("opG"),{ + operatorTree("opF", { + // Note: we currently don't enforce that these 2 opC nodes + // have to be the same. + operatorTree("opB", { + operatorTree("opC", { + treeStar(Criteria("input")) + }, 2), + }) + }), + tree(any()) // matches dataI + }); + // clang-format on +} + +TestGraph::NodeRef getInNode(TestGraph::NodeRef node, int index) { + return node->getInEdges()[index]->tail(); +} + +} // namespace matcher + +} // namespace nom + +using namespace nom::matcher; + +// Simple test cases for node matching criteria. +TEST(SubgraphMatcher, IsNodeMatch) { + TestGraph graph; + auto n1 = graph.createNode("Hello"); + auto n2 = graph.createNode("Le"); + graph.createEdge(n1, n2); + + EXPECT_TRUE(TestMatcher::isNodeMatch(n1, "Hello")); + EXPECT_FALSE(TestMatcher::isNodeMatch(n1, "G")); + EXPECT_TRUE(TestMatcher::isNodeMatch(n2, "Le")); + EXPECT_FALSE(TestMatcher::isNodeMatch(n2, "le")); +} + +// Test subtree matching with a simple tree graph. 
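+// NB: the extra boolean argument passed to isSubtreeMatch() in the tests below
+// flips the match direction: with `false`, a criteria's children are checked
+// against a node's out-edges, whereas the dataflow tests further down rely on
+// the default and match against a node's in-edges (its producers).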
+TEST(SubgraphMatcher, IsSubtreeMatch) { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3 = graph.createNode("3"); + auto n4 = graph.createNode("4"); + auto n5 = graph.createNode("5"); + auto n6 = graph.createNode("6"); + auto n7 = graph.createNode("7"); + + graph.createEdge(n1, n2); + graph.createEdge(n2, n3); + graph.createEdge(n2, n4); + graph.createEdge(n1, n5); + graph.createEdge(n5, n6); + graph.createEdge(n5, n7); + /* N1 + / \ + N2 N5 + / \ / \ + N3 N4 N6 N7 + */ + + auto subtree = tree(any(), {tree(any()), tree(any())}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n4, subtree, false)); + + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); + + subtree = tree(Criteria("5"), {tree(any()), tree(any())}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n2, subtree, false)); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); + + subtree = tree(any(), {tree(any()), tree(Criteria("4"))}); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n5, subtree, false)); +} + +// Test subtree matching in which * (repeated) matching of children is allowed. +TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3A = graph.createNode("3"); + auto n3B = graph.createNode("3"); + auto n4 = graph.createNode("4"); + auto n5A = graph.createNode("5"); + auto n5B = graph.createNode("5"); + auto n5C = graph.createNode("5"); + graph.createEdge(n1, n2); + graph.createEdge(n1, n3A); + graph.createEdge(n1, n3B); + graph.createEdge(n1, n4); + graph.createEdge(n1, n4); + graph.createEdge(n1, n5A); + graph.createEdge(n1, n5B); + graph.createEdge(n1, n5C); + + auto subtree = tree(any(), {tree(Criteria("2"))}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), {treeStar(Criteria("2"))}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + // clang-format off + subtree = tree(any(), { + tree(Criteria("2")), + tree(Criteria("3"), {}, 2), + tree(Criteria("4"), {}, 2), + tree(Criteria("5"), {}, 3) + }); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + tree(Criteria("3"), {}, 2), + tree(Criteria("4"), {}, 2), + treeStar(Criteria("5")) + }); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + treeStar(Criteria("3")), + tree(Criteria("4"), {}, 2), + treeStar(Criteria("5")) + }); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + treeStar(Criteria("3")), + }); + // Fails because there are unmatched edges. + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + tree(Criteria("3"), {}, 2), + tree(Criteria("4")), + tree(Criteria("5"), {}, 3) + }); + // Fails because the count is wrong; we have 2 edges to node N4 while + // the pattern expects only 1. 
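+ // (A tree() criteria without an explicit count expects exactly one matching
+ // edge; treeStar() relaxes the count and accepts a variable number.)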
+ EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + // clang-format on +} + +TEST(SubgraphMatcher, IsSubtreeMatchRealistic) { + auto graph = DataFlowTestGraph(); + auto subtree = DataFlowTestGraphCriteria(); + + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opF, subtree)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opC, subtree)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opB, subtree)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.dataOut, subtree)); + + EXPECT_TRUE(TestMatcher::isSubtreeMatch(graph.opG, subtree)); +} + +TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { + auto graph = DataFlowTestGraph(); + auto subtree = DataFlowTestGraphCriteria(); + + TestMatcher::replaceSubtree( + graph.graph, subtree, [](TestGraph& g, TestGraph::NodeRef opG) { + auto opFused = g.createNode("opFused"); + + auto dataF = getInNode(opG, 0); + auto opF = getInNode(dataF, 0); + auto dataB = getInNode(opF, 0); + auto opB = getInNode(dataB, 0); + auto dataC = getInNode(opB, 0); + auto opC = getInNode(dataC, 0); + + g.deleteNode(dataF); + g.replaceNode(opG, opFused); + + auto outEdgesC = opC->getOutEdges(); + g.deleteNode(outEdgesC[0]->head()); + g.deleteNode(outEdgesC[1]->head()); + g.replaceNode(opC, opFused); + + g.deleteNode(opC); + g.deleteNode(opB); + g.deleteNode(dataB); + g.deleteNode(opF); + g.deleteNode(opG); + + return true; + }); + + // Now the nodes are: + // - NumInputs input nodes + // - dataI node + // - fused node + // - output node + auto nodes = graph.graph.getMutableNodes(); + + // Test that the graph is transformed as expected. + EXPECT_EQ(nodes.size(), graph.numInputs + 3); + TestGraph::NodeRef opFused; + TestGraph::NodeRef dataI; + TestGraph::NodeRef dataOut; + for (auto node : nodes) { + if (node->data() == "opFused") { + opFused = node; + } else if (node->data() == "dataOut") { + dataOut = node; + } else if (node->data() == "dataI") { + dataI = node; + } + } + + EXPECT_EQ(getInNode(dataOut, 0), opFused); + EXPECT_EQ(getInNode(opFused, 0), dataI); + for (int i = 1; i <= graph.numInputs; i++) { + EXPECT_EQ(getInNode(opFused, i)->data(), "input"); + } + + // Use nom::converters::convertToDotString(&graph.graph, TestGraphNodePrinter) + // to visualize. The transformed graph looks like This + /* + + +---------++---------+ + | input_A || input_D | + +---------++---------+ + | | + | | + v v ++---------+ +--------------------+ +---------+ +| input_B | --> | opFused | <-- | input_C | ++---------+ +--------------------+ +---------+ + | ^ + | | + v | + +---------++---------+ + | dataOut || dataI | + +---------++---------+ + */ +} diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 734d38d75e680d..9f88f192936fe4 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -323,6 +323,10 @@ class OperatorBase : public Observable { return !event_; } + virtual void SyncDevice() { + CAFFE_NOT_IMPLEMENTED; + } + // Checks whether stream is ready to execute new computation, // used in stream allocation optimization to skip stream that is currently // busy. 
Depends on context and operator's device, returns true by default @@ -577,6 +581,8 @@ class Operator : public OperatorBase { return &context_; } + void SyncDevice() final {} + virtual std::vector> InputFillers( const std::vector>& shapes) { CAFFE_ENFORCE(shapes.size() == Inputs().size()); diff --git a/caffe2/core/operator_gpu.cc b/caffe2/core/operator_gpu.cc new file mode 100644 index 00000000000000..03f227f7453524 --- /dev/null +++ b/caffe2/core/operator_gpu.cc @@ -0,0 +1,26 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template <> +void Operator::SyncDevice() { + auto* context = getContext(); + int device; + cudaGetDevice(&device); + + cudaEvent_t ev; + cudaSetDevice(context->cuda_gpu_id()); + cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); + cudaEventRecord(ev, context->cuda_stream()); + cudaEventSynchronize(ev); + cudaEventDestroy(ev); + cudaSetDevice(device); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) { + CAFFE_THROW("Encountered CUDA error Stop: ", cudaGetErrorString(error)); + } +} + +} // namespace caffe2 diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index b4a01b57cc11e3..facea9fa64d2fa 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -14,8 +14,9 @@ #include +#include "ATen/core/Half.h" #include "caffe2/core/common.h" -#include "caffe2/utils/IdWrapper.h" +#include "ATen/core/IdWrapper.h" namespace caffe2 { class CaffeTypeId; @@ -32,16 +33,16 @@ class TypeMeta; * You need to register your types using CAFFE_KNOWN_TYPE(MyType) to be able to use CaffeTypeId with custom types. * This is for example used to store the dtype of tensors. */ -class CaffeTypeId final : public c10::guts::IdWrapper { +class CaffeTypeId final : public at::IdWrapper { public: static CaffeTypeId createTypeId(); friend std::ostream& ::operator<<(std::ostream& stream, CaffeTypeId typeId); friend bool operator<(CaffeTypeId lhs, CaffeTypeId rhs); - // TODO Can we get rid of uninitialized? + // This is 8, because 0 is uint8_t (due to ScalarType BC constraint) static constexpr CaffeTypeId uninitialized() { - return CaffeTypeId(0); + return CaffeTypeId(8); } private: @@ -57,7 +58,7 @@ inline bool operator<(CaffeTypeId lhs, CaffeTypeId rhs) { } -C10_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) +AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) inline std::ostream& operator<<(std::ostream& stream, caffe2::CaffeTypeId typeId) { return stream << typeId.underlyingId(); @@ -439,35 +440,41 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { class Tensor; -// note: first preallocated id is 1, because 0 is used for uninitialized type -// ids. +// Note: we have preallocated the numbers 0-8 so they line up exactly +// with at::ScalarType's numbering. All other numbers do not matter. +// +// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. 
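To make the intended correspondence concrete, here is a minimal, illustrative sketch of the invariant these ids are meant to satisfy, assuming at::ScalarType is visible here and keeps the numbering described in the comment above (this check is not part of the patch):

    // Illustrative only: the preallocated Caffe2 type ids mirror at::ScalarType,
    // e.g. float sits at id 6 and id 8 is reserved for the undefined/uninitialized type.
    static_assert(static_cast<int>(at::ScalarType::Float) == 6,
                  "float must keep id 6 to match CAFFE_DECLARE_KNOWN_TYPE(6, float)");
    static_assert(static_cast<int>(at::ScalarType::Undefined) == 8,
                  "id 8 is reserved for the uninitialized type id");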
+ struct _CaffeHighestPreallocatedTypeId final {}; -CAFFE_DECLARE_KNOWN_TYPE(1, Tensor); -CAFFE_DECLARE_KNOWN_TYPE(2, float); +CAFFE_DECLARE_KNOWN_TYPE(0, uint8_t); +CAFFE_DECLARE_KNOWN_TYPE(1, int8_t); +CAFFE_DECLARE_KNOWN_TYPE(2, int16_t); CAFFE_DECLARE_KNOWN_TYPE(3, int); -CAFFE_DECLARE_KNOWN_TYPE(4, std::string); -CAFFE_DECLARE_KNOWN_TYPE(5, bool); -CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t); -CAFFE_DECLARE_KNOWN_TYPE(7, int8_t); -CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t); -CAFFE_DECLARE_KNOWN_TYPE(9, int16_t); -CAFFE_DECLARE_KNOWN_TYPE(10, int64_t); -CAFFE_DECLARE_KNOWN_TYPE(11, double); -CAFFE_DECLARE_KNOWN_TYPE(12, char); -CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr); -CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr>); -CAFFE_DECLARE_KNOWN_TYPE(15, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(18, bool*); -CAFFE_DECLARE_KNOWN_TYPE(19, char*); -CAFFE_DECLARE_KNOWN_TYPE(20, int*); +CAFFE_DECLARE_KNOWN_TYPE(4, int64_t); +CAFFE_DECLARE_KNOWN_TYPE(5, at::Half); +CAFFE_DECLARE_KNOWN_TYPE(6, float); +CAFFE_DECLARE_KNOWN_TYPE(7, double); +// 8 = undefined type id + +CAFFE_DECLARE_KNOWN_TYPE(9, Tensor); +CAFFE_DECLARE_KNOWN_TYPE(10, std::string); +CAFFE_DECLARE_KNOWN_TYPE(11, bool); +CAFFE_DECLARE_KNOWN_TYPE(12, uint16_t); +CAFFE_DECLARE_KNOWN_TYPE(13, char); +CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr); +CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr>); +CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(18, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(19, bool*); +CAFFE_DECLARE_KNOWN_TYPE(20, char*); +CAFFE_DECLARE_KNOWN_TYPE(21, int*); #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(21, long); -CAFFE_DECLARE_KNOWN_TYPE(22, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(22, long); +CAFFE_DECLARE_KNOWN_TYPE(23, std::vector); #endif // CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId); +CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId); } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 35c2008d4fdab0..c7215e0ed28b32 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -21,7 +21,7 @@ class IDEEPContext final : public BaseContext { CAFFE_ENFORCE_EQ(option.device_type(), IDEEP); } - ~IDEEPContext() noexcept {} + ~IDEEPContext() noexcept override {} BaseStaticContext* GetStaticContext() const override { return GetIDEEPStaticContext(); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 45f55ab2407a2e..755e1b5a57b8a9 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. 
noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType() || + if (!noiseBlob->IsType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 9f032e6fe299d0..bcf588d8a384f0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutable(); + auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutable(); + auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutable(); + auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -683,7 +683,7 @@ void testMPSCNN() { Workspace ws; { auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,8 +1072,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ 
-1081,7 +1080,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1093,7 +1092,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1189,7 +1188,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1197,7 +1196,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1205,7 +1204,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1276,7 +1275,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1284,7 +1283,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1292,7 +1291,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1386,7 +1385,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1394,7 +1393,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1402,7 +1401,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1494,7 +1493,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1502,7 +1501,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1510,7 +1509,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1608,7 +1607,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1616,7 +1615,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = 
ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1624,7 +1623,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1727,7 +1726,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1735,7 +1734,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1792,7 +1791,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1800,7 +1799,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1857,7 +1856,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1865,7 +1864,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1922,7 +1921,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1930,7 +1929,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2012,7 +2011,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2066,7 +2065,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2137,7 +2136,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2145,7 +2144,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2251,14 +2250,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; 
math::RandGaussian( t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2363,7 +2362,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2498,7 +2497,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2506,7 +2505,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutable(); + auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2514,7 +2513,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutable(); + auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2522,7 +2521,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutable(); + auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2588,7 +2587,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2662,8 +2661,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu") - ->GetMutable(); + auto* t = + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2676,7 +2675,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutable(); + ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize( inputChannels, outputChannels, @@ -2693,7 +2692,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutable(); + ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2810,7 +2809,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2892,7 +2891,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2965,7 +2964,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3338,7 +3337,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { cws.RunNetOnce(initNet); { auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutable(); + cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3350,7 +3349,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { mws.RunNetOnce(initNet); { auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutable(); + mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3398,16 +3397,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutable(); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index 70b9ac05747511..2896bc26ac08d4 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -41,7 +41,7 @@ void RowsWhereRoILevelEquals(Eigen::Ref rois, // distribute those proposals to their appropriate FPN levels for Faster RCNN. // An anchor at one FPN level may predict an RoI that will map to another // level, hence the need to redistribute the proposals. -// Reference: detectron/lib/ops/collect_and_distribute_fpn_rpn_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index ddb0f8f89c144b..2f11645f21c5cc 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -602,12 +602,12 @@ bool CudnnConvOp::DoRunWithType() { kernel_w())); } else { vector dims(filter.dims().begin(), filter.dims().end()); - dims[0] /= group_; #if !CUDNN_VERSION_MIN(7, 0, 0) + // We only need to divide dims by group_ when CUDNN version < 7.0 + // see CUDA group convolution doc: https://fburl.com/dgj6dvpd order_ == StorageOrder::NCHW ? 
dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; #endif - dims[filter.ndim() - 1] /= group_; CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, @@ -959,10 +959,12 @@ bool CudnnConvGradientOp::DoRunWithType() { } else { vector dims(filter.dims().begin(), filter.dims().end()); #if !CUDNN_VERSION_MIN(7, 0, 0) - dims[0] /= group_; -#endif + // We only need to divide dims by group_ when CUDNN version < 7.0 + // see CUDA group convolution doc: https://fburl.com/dgj6dvpd order_ == StorageOrder::NCHW ? dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; +#endif + CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index 81f7d9ac43123f..faf4936495244f 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -59,7 +59,7 @@ ERMatXf ComputeAllAnchors( // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the // final bounding boxes. -// Reference: detectron/lib/ops/generate_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/generate_proposals.py template class GenerateProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 0c4c345d382cb1..333514102b7d4b 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -5,7 +5,7 @@ #include "caffe2/utils/math.h" // Bounding box utils for generate_proposals_op -// Reference: detectron/lib/utils/boxes.py +// Reference: facebookresearch/Detectron/detectron/utils/boxes.py namespace caffe2 { namespace utils { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 5d6f87d4d30563..7b38cd6a1420d6 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -19,7 +19,7 @@ namespace utils { // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. -// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // scores: scores for each bounding box, size: (M, 1) @@ -78,7 +78,7 @@ std::vector nms_cpu_upright( /** * Soft-NMS implementation as outlined in https://arxiv.org/abs/1704.04503. - * Reference: detectron/lib/utils/cython_nms.pyx + * Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx * out_scores: Output updated scores after applying Soft-NMS * proposals: pixel coordinates of proposed bounding boxes, * size: (M, 4), format: [x1; y1; x2; y2] @@ -426,7 +426,7 @@ std::vector nms_cpu( // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. 
-// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // size: (M, 5), format: [ctr_x; ctr_y; w; h; angle (degrees)] for RRPN diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index b4866618b4e607..37d675eba83a49 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -146,9 +146,6 @@ REGISTER_CONVERTER(SpatialBN, BatchNormalizationConverter); TRIVIAL_CONVERTER(Flatten); REGISTER_CONVERTER(Flatten, FlattenConverter); -TRIVIAL_CONVERTER(BatchGather); -REGISTER_CONVERTER(BatchGather, BatchGatherConverter); - class AveragePoolConverter : public Converter { std::unique_ptr convertToNeuralNetOperator( const OperatorDef& op) override { @@ -205,37 +202,6 @@ class ConcatConverter : public Converter { }; REGISTER_CONVERTER(Concat, ConcatConverter); -class BatchMatMulConverter : public Converter { - std::unique_ptr convertToNeuralNetOperator( - const OperatorDef& op) override { - std::unique_ptr nnOp = - util::make_unique(); - auto argMap = getArgumentsFromOperator(op); - - auto c = dyn_cast(nnOp.get()); - if (argMap.count("trans_a")) { - CAFFE_ENFORCE(argMap["trans_a"].has_i(), "Invalid axis argument"); - int trans_a = static_cast(argMap["trans_a"].i()); - c->setTransA(!!trans_a); - } - if (argMap.count("trans_b")) { - CAFFE_ENFORCE(argMap["trans_b"].has_i(), "Invalid add_axis argument"); - int trans_b = static_cast(argMap["trans_b"].i()); - c->setTransB(!!trans_b); - } - if (argMap.count("broadcast")) { - CAFFE_ENFORCE(argMap["broadcast"].has_i(), "Invalid add_axis argument"); - int broadcast = static_cast(argMap["broadcast"].i()); - c->setBroadcast(!!broadcast); - } - return nnOp; - } - // Does not override default converter to OperatorDef - - virtual ~BatchMatMulConverter() {} -}; -REGISTER_CONVERTER(BatchMatMul, BatchMatMulConverter); - } // namespace std::unique_ptr convertToNeuralNetOperator( @@ -270,145 +236,6 @@ std::unique_ptr convertToNeuralNetOperator( return nnOp; } -void handleWhileOp( - repr::NNGraph& dfg, - repr::NNCFGraph& cfg, - repr::NNGraph::NodeRef& opNode, - repr::NNCFGraph::NodeRef& bbNode, - OperatorDef& op, - std::unordered_map& blobMap -) { - opNode->resetData(util::make_unique()); - auto argMap = Converter::getArgumentsFromOperator(op); - std::string bodyNetSerialized = argMap["body"].s(); - auto bodyNet = caffe2::NetDef(); - bodyNet.ParseFromString(bodyNetSerialized); - - std::unordered_map bodyBlobMap; - auto bodyNN = convertToNNModule(bodyNet, &bodyBlobMap); - repr::NNGraph bodyGraph = std::move(bodyNN.dataFlow); - repr::NNCFGraph bodyCFGraph = std::move(bodyNN.controlFlow); - - auto rev_sorted = algorithm::tarjans(&bodyGraph); - - for (auto& k : bodyBlobMap) { - auto name = k.first; - if (blobMap.count(name)) { - auto oldNode = blobMap[name]; - printf("Exit tensor %s is in the parent scope, inserting Phi node...\n", k.first.c_str()); - auto phiNode = dfg.createNode(util::make_unique()); // NN variant of a Phi node - // Clone the operator. 
- auto tensor = dyn_cast(blobMap[name]->data().get()); - auto* clonedTensor = tensor->clone(); - auto phiOut = dfg.createNode(std::unique_ptr(clonedTensor)); - dfg.createEdge(phiNode, phiOut); - dfg.createEdge(oldNode, phiNode); - dfg.createEdge(bodyBlobMap[name], phiNode); - blobMap[name] = phiOut; - for (auto& inEdge : opNode->getInEdges()) { - if (inEdge->tail() == oldNode) { - dfg.deleteEdge(inEdge); - dfg.createEdge(phiOut, opNode); - } - } - } - } - - // Dependencies simply have no producers - std::unordered_map inNodeMap; - for (auto& n : bodyGraph.getMutableNodes()) { - if (!isa(n->data())) { continue; } - if (n->getInEdges().size() == 0) { - auto name = dyn_cast(n->data().get())->getName(); - // TODO(bwasti): this may be needed, depending on constraints - //assert(blobMap.count(name) != 0 && "Loop body takes undefined dependency."); - if (blobMap.count(name)) { - inNodeMap[n] = blobMap[name]; - } - } - } - - CAFFE_ENFORCE(rev_sorted.front().getNodes().size() == 1, - "More than one exit node."); - CAFFE_ENFORCE(rev_sorted.back().getNodes().size() == 1, - "More than one entry node."); - - auto exit_tensor = *(rev_sorted.front().getNodes().begin()); - CAFFE_ENFORCE(isa(exit_tensor->data()), - "Exit node is not a tensor."); - - auto bodyNodes = bodyGraph.getMutableNodes(); - auto bodyEdges = bodyGraph.getMutableEdges(); - - for (auto node : bodyNodes) { - bodyGraph.importNode(node, dfg); - } - - for (auto edge : bodyEdges) { - bodyGraph.importEdge(edge, dfg); - } - - // Merge all dependencies - for (auto node : dfg.getMutableNodes()) { - if (inNodeMap.count(node)) { - dfg.replaceNode(node, inNodeMap[node]); - dfg.deleteNode(node); - } - } - - for (const auto& inEdge : opNode->getInEdges()) { - auto* inputData = dyn_cast(inEdge->tail()->data().get()); - auto* exitData = dyn_cast(exit_tensor->data().get()); - if (inputData->getName() == exitData->getName()) { - dfg.replaceNode(exit_tensor, inEdge->tail()); - dfg.deleteNode(exit_tensor); - } - } - - // CFG Handling - auto bodyCFNodes = bodyCFGraph.getMutableNodes(); - auto bodyCFEdges = bodyCFGraph.getMutableEdges(); - - // Create a while loop CFG node. - auto whileBasicBlock = util::make_unique>(); - for (auto& inEdge : opNode->getInEdges()) { - auto node = inEdge->tail(); - for (auto& parentInEdge : node->getInEdges()) { - auto parentNode = parentInEdge->tail(); - if (isa(parentNode->data().get())) { - whileBasicBlock->pushInstructionNode(parentNode); - } - } - } - whileBasicBlock->pushInstructionNode(opNode); - - auto whileCFNode = cfg.createNode(std::move(whileBasicBlock)); - cfg.createEdge(bbNode, whileCFNode, 0); - - // The true path executes the body of the loop, so we - // take that BB and point to it. - for (auto cfNode : bodyCFNodes) { - bodyCFGraph.importNode(cfNode, cfg); - // If the CFG node has no children, we loop back to the top of the - // while loop. - if (cfNode->getOutEdges().size() == 0) { - cfg.createEdge(cfNode, whileCFNode, 0); - } - // TODO check for a single entry point - if (cfNode->getInEdges().size() == 0) { - cfg.createEdge(whileCFNode, cfNode, 1); - } - } - for (auto cfEdge : bodyCFEdges) { - bodyCFGraph.importEdge(cfEdge, cfg); - } - - // Now create the false case. - bbNode = - cfg.createNode(util::make_unique>()); - cfg.createEdge(whileCFNode, bbNode, -1); -} - /// \brief Ingest a caffe2 protobuf model and output an NNModule. 
/// \param net The caffe2 protobuf NetDef @@ -455,13 +282,9 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mapresetData(convertToNeuralNetOperator(op)); - auto currentBasicBlock = bbNode->mutableData()->get(); - currentBasicBlock->pushInstructionNode(opNode); - } + opNode->resetData(convertToNeuralNetOperator(op)); + auto currentBasicBlock = bbNode->mutableData()->get(); + currentBasicBlock->pushInstructionNode(opNode); } repr::NNModule module; diff --git a/caffe2/opt/converter_nomigraph_test.cc b/caffe2/opt/converter_nomigraph_test.cc index 69f51df49cbf74..0bab53f738d7c2 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -48,65 +48,3 @@ TEST(Converter, UnknownType) { auto new_netdef = caffe2::convertToCaffe2Proto(nn); } -/* Temporarily disabled While conversion tests -TEST(Converter, While) { - caffe2::NetDef net; - - caffe2::OperatorDef *def = net.add_op(); - def->set_type("While"); - def->add_input("X"); - - caffe2::NetDef body_net; - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Relu"); - rdef->add_input("X"); - rdef->add_output("X"); - } - std::string body_net_serialized; - assert(body_net.SerializeToString(&body_net_serialized)); - ADD_ARG(def, "body", s, body_net_serialized); - - auto nn = caffe2::convertToNNModule(net); -} - -TEST(Converter, ComplexWhile) { - caffe2::NetDef net; - - { - caffe2::OperatorDef *rdef = net.add_op(); - rdef->set_type("Relu"); - rdef->add_input("X"); - rdef->add_output("X"); - } - - caffe2::OperatorDef *def = net.add_op(); - def->set_type("While"); - def->add_input("X"); - - caffe2::NetDef body_net; - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Instr1"); - rdef->add_input("X"); - rdef->add_output("X"); - } - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Instr2"); - rdef->add_input("X"); - rdef->add_output("X"); - } - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Instr3"); - rdef->add_input("X"); - rdef->add_output("X"); - } - std::string body_net_serialized; - assert(body_net.SerializeToString(&body_net_serialized)); - ADD_ARG(def, "body", s, body_net_serialized); - - auto nn = caffe2::convertToNNModule(net); -} -*/ diff --git a/caffe2/opt/device.cc b/caffe2/opt/device.cc index 9abca6d67e08b3..0cfdd6c1dc91a3 100644 --- a/caffe2/opt/device.cc +++ b/caffe2/opt/device.cc @@ -9,15 +9,14 @@ std::vector getInputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector inputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); NOM_REQUIRE_OR_CONT(nn::hasInputs(node)); // Check if tensor's parents are in the sg for (const auto& input : nn::getInputs(node)) { NOM_REQUIRE_OR_CONT( - !nn::hasProducer(input) || - sg.Nodes.count(nn::getProducer(input)) == 0); + !nn::hasProducer(input) || !sg.hasNode(nn::getProducer(input))); inputTensorEdges.emplace_back(g.getEdge(input, node)); } } @@ -28,13 +27,13 @@ std::vector getOutputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector outputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); for (const auto& output : nn::getOutputs(node)) { auto consumers = nn::getConsumers(output); for (const auto& consumer : consumers) { - NOM_REQUIRE_OR_CONT(sg.Nodes.count(consumer) == 0); + NOM_REQUIRE_OR_CONT(!sg.hasNode(consumer)); outputTensorEdges.emplace_back(g.getEdge(node, output)); } 
NOM_REQUIRE_OR_CONT(consumers.size() == 0); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index 8a1b736399562a..f5ea0f678ed515 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -1,5 +1,6 @@ -#include "caffe2/opt/converter.h" #include "caffe2/opt/fusion.h" +#include "caffe2/core/logging.h" +#include "caffe2/opt/converter.h" #include "caffe2/opt/passes.h" namespace caffe2 { @@ -18,27 +19,25 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { for (auto convNode : repr::nn::nodeIterator(nn->dataFlow)) { auto output = repr::nn::getOutputs(convNode).front(); auto consumers = repr::nn::getConsumers(output); - if (consumers.size() != 1) { - continue; - } + NOM_REQUIRE_OR_CONT(consumers.size() == 1); + auto consumer = consumers.front(); - if (!repr::nn::is(consumer)) { - continue; - } + NOM_REQUIRE_OR_CONT(repr::nn::is(consumer)); + auto bnNode = consumer; auto bn = repr::nn::get(bnNode); + auto bnOutputs = nn::getOutputs(bnNode); + NOM_REQUIRE_OR_CONT(bnOutputs.size() == 1); + auto bnOutput = bnOutputs.front(); auto convInputs = repr::nn::getInputs(convNode); - if (convInputs.size() < 3) { - assert(0 && "Invalid convolution input size (TODO: optional bias)"); - continue; - } + CAFFE_ENFORCE( + convInputs.size() >= 3, + "Invalid convolution input size (TODO: optional bias)"); auto bnInputs = repr::nn::getInputs(bnNode); - if (bnInputs.size() < 5) { - assert(0 && "Invalid batch normalization input size"); - continue; - } + CAFFE_ENFORCE( + bnInputs.size() >= 5, "Invalid batch normalization input size"); #define EXPOSE_TENSOR_DATA(name, index, inputs) \ auto name = repr::nn::get(inputs[index]); \ @@ -69,6 +68,8 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { biasConvData[c] = bias; } + nn->dataFlow.deleteNode(output); + nn->dataFlow.createEdge(convNode, bnOutput); nn->dataFlow.deleteNode(bnNode); return true; } diff --git a/caffe2/opt/mobile.cc b/caffe2/opt/mobile.cc index 6d0006818789bb..adbbbd19a1e367 100644 --- a/caffe2/opt/mobile.cc +++ b/caffe2/opt/mobile.cc @@ -11,23 +11,15 @@ using namespace nom; void addNNPACK(repr::NNModule* nn, bool low_memory) { for (auto node : nn->dataFlow.getMutableNodes()) { - auto* nodeData = node->data().get(); // Let graph retain ownership. - // Skip blobs. - if (!isa(nodeData)) { - continue; - } + NOM_REQUIRE_OR_CONT(repr::nn::is(node)); // Check if it is a convolution. - auto nnOp = dyn_cast(nodeData); - if (!isa(nnOp)) { - continue; - } + auto nnOp = repr::nn::get(node); + NOM_REQUIRE_OR_CONT(isa(nnOp)); // Requires X, W, b for NNPACK - if (node->getInEdges().size() < 3) { - continue; - } + NOM_REQUIRE_OR_CONT(node->getInEdges().size() >= 3); std::string engine = "NNPACK"; @@ -35,9 +27,7 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { bool validTransformCandidate = true; auto conv = dyn_cast(nnOp); - if (conv->getLayout() != nom::repr::Conv::NNLayout::NCHW) { - continue; - } + NOM_REQUIRE_OR_CONT(conv->getLayout() == nom::repr::Conv::NNLayout::NCHW); // NNPACK only supports stride == 1 for (auto stride : conv->getStrides()) { @@ -46,28 +36,21 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { break; } } - if (!validTransformCandidate) { - continue; - } + NOM_REQUIRE_OR_CONT(validTransformCandidate); // NNPACK only supports 2DConv. const auto& kernelShape = conv->getKernelShape(); - if (kernelShape.size() != 2) { - continue; - } + NOM_REQUIRE_OR_CONT(kernelShape.size() == 2); // Kx1 and 1xK convs are inefficient in NNPACK. 
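// (NOM_REQUIRE_OR_CONT(cond) skips to the next loop iteration when `cond` does
// not hold; it is what replaces the explicit if (...) { continue; } blocks
// deleted in this function and in fusion.cc above.)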
if (kernelShape[0] != kernelShape[1]) { - if (kernelShape[0] == 1 || kernelShape[1] == 1) { - continue; - } + NOM_REQUIRE_OR_CONT(kernelShape[0] != 1 && kernelShape[1] != 1); } // We're good to use our engine. auto annotation = conv->getMutableAnnotation(); - if (!annotation || !isa(annotation)) { - continue; - } + NOM_REQUIRE_OR_CONT(annotation && isa(annotation)); + auto* op = dyn_cast(annotation)->getMutableOperatorDef(); op->set_engine(engine); if (!low_memory) { diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 75baec0e9be66b..09528b99b5da51 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -323,8 +323,10 @@ void OnnxifiTransformer::Transform( // function to tell whether the ONNXIFI backend supports a given C2 op or not // TODO: choose backend id + onnxifi_library* backend = lib_; + onnxBackendID backend_id = backend_ids_[0]; auto supports = - [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( + [&exporter, &shape_hints, backend, backend_id]( const caffe2::OperatorDef& op) { const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); // NB: this might not be a hard constraint as we can just export C2 diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 4c1e13d1008ac8..03264daf50f6a7 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -2,6 +2,7 @@ #ifdef CAFFE2_OPTIMIZER #include "caffe2/opt/optimizer.h" #endif +#include "caffe2/utils/proto_utils.h" #include #include "caffe2/core/init.h" @@ -96,7 +97,9 @@ Predictor::Predictor( GlobalInit(); #endif auto predict_net = config_.predict_net; - if (optimization) { + + if (optimization && + !ArgumentHelper::HasArgument(*predict_net, "disable_nomnigraph")) { #ifdef CAFFE2_OPTIMIZER try { *predict_net = opt::optimize(*predict_net, &ws_, optimization); diff --git a/caffe2/predictor/predictor.h b/caffe2/predictor/predictor.h index a3f05d7aacac89..458bf4401476c4 100644 --- a/caffe2/predictor/predictor.h +++ b/caffe2/predictor/predictor.h @@ -28,7 +28,7 @@ class Predictor { const NetDef& run_net, Workspace* parent = nullptr, bool run_init = true, - int optimization = 0); + int optimization = 1); ~Predictor() {} diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index d10bfe209f7b39..dd1734a587c1fc 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -630,7 +630,7 @@ def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): beta=st.floats(min_value=0.1, max_value=0.9), lambda1=st.floats(min_value=0.001, max_value=0.1), lambda2=st.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None]), + engine=st.sampled_from([None, "SIMD"]), **hu.gcs_cpu_only) def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, engine, gc, dc): diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index b1c0e1cd885ea4..d2b6a4f6399fff 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -5,10 +5,12 @@ from __future__ import print_function from __future__ import unicode_literals +from abc import ABCMeta, abstractmethod import argparse from future.utils import viewitems import logging import numpy as np +from six import with_metaclass import sys from caffe2.python import core, rnn_cell, workspace @@ -31,7 +33,60 @@ def _weighted_sum(model, values, weight, output_name): ) -class Seq2SeqModelCaffe2EnsembleDecoder(object): +class 
Seq2SeqModelCaffe2EnsembleDecoderBase(with_metaclass(ABCMeta, object)): + + @abstractmethod + def get_model_file(self, model): + pass + + @abstractmethod + def get_db_type(self): + pass + + def build_word_rewards(self, vocab_size, word_reward, unk_reward): + word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) + word_rewards[seq2seq_util.PAD_ID] = 0 + word_rewards[seq2seq_util.GO_ID] = 0 + word_rewards[seq2seq_util.EOS_ID] = 0 + word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward + return word_rewards + + def load_models(self): + db_reader = 'reader' + for model, scope_name in zip( + self.models, + self.decoder_scope_names, + ): + params_for_current_model = [ + param + for param in self.model.GetAllParams() + if str(param).startswith(scope_name) + ] + assert workspace.RunOperatorOnce(core.CreateOperator( + 'CreateDB', + [], [db_reader], + db=self.get_model_file(model), + db_type=self.get_db_type()) + ), 'Failed to create db {}'.format(self.get_model_file(model)) + assert workspace.RunOperatorOnce(core.CreateOperator( + 'Load', + [db_reader], + params_for_current_model, + load_all=1, + add_prefix=scope_name + '/', + strip_prefix='gpu_0/', + )) + logger.info('Model {} is loaded from a checkpoint {}'.format( + scope_name, self.get_model_file(model))) + + +class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase): + + def get_model_file(self, model): + return model['model_file'] + + def get_db_type(self): + return 'minidb' def scope(self, scope_name, blob_name): return ( @@ -258,14 +313,6 @@ def _build_decoder( attention_weights, ) - def build_word_rewards(self, vocab_size, word_reward, unk_reward): - word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) - word_rewards[seq2seq_util.PAD_ID] = 0 - word_rewards[seq2seq_util.GO_ID] = 0 - word_rewards[seq2seq_util.EOS_ID] = 0 - word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward - return word_rewards - def __init__( self, translate_params, @@ -414,36 +461,6 @@ def __init__( for param in self.model.params: logger.info(param) - def load_models(self): - db_reader = 'reader' - for model, scope_name in zip( - self.models, - self.decoder_scope_names, - ): - params_for_current_model = [ - param - for param in self.model.GetAllParams() - if str(param).startswith(scope_name) - ] - assert workspace.RunOperatorOnce(core.CreateOperator( - 'CreateDB', - [], [db_reader], - db=model['model_file'], - db_type='minidb') - ), 'Failed to create db {}'.format(model['model_file']) - assert workspace.RunOperatorOnce(core.CreateOperator( - 'Load', - [db_reader], - params_for_current_model, - load_all=1, - add_prefix=scope_name + '/', - strip_prefix='gpu_0/', - )) - logger.info('Model {} is loaded from a checkpoint {}'.format( - scope_name, - model['model_file'], - )) - def decode(self, numberized_input, max_output_seq_len): workspace.FeedBlob( self.encoder_inputs, diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index dab79b8b1fb0b4..93e45704fcfea6 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -35,6 +35,7 @@ import onnx.defs import onnx.optimizer import onnx.shape_inference +import onnx.utils from onnx.backend.base import Backend, Device, DeviceType, namedtupledict from caffe2.python.onnx.workspace import Workspace @@ -876,6 +877,7 @@ def _graph_to_net(cls, onnx_graph, opset_version): def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_initializers): device_option = get_device_option(Device(device)) + onnx_model = 
onnx.utils.polish_model(onnx_model) init_model = cls.optimize_onnx(onnx_model, init=True) pred_model = cls.optimize_onnx(onnx_model, predict=True) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index db870972f83946..ee60d776d55a82 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1421,7 +1421,8 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - # SIMD version of GFTRL is not supported + if engine == "SIMD": + assert core.IsOperator('GFtrl_ENGINE_SIMD') gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 07f88def015544..3c42c2073163cd 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -20,6 +20,7 @@ def add_tensor(net, name, blob): np.dtype('int32'): "GivenTensorIntFill", np.dtype('int64'): "GivenTensorInt64Fill", np.dtype('uint8'): "GivenTensorStringFill", + np.dtype('O'): "GivenTensorStringFill" } shape = blob.shape @@ -29,6 +30,12 @@ def add_tensor(net, name, blob): if blob.dtype == np.dtype('uint8'): shape = [1] values = [str(blob.data)] + # Only allow string arrays as objects. + # The only intended use case for this is to store arrays of strings in the + # model which can be used for post processing results in subsequent ops. + if blob.dtype == np.dtype('O'): + for blob_val in blob: + assert(isinstance(blob_val, bytes)) op = core.CreateOperator( kTypeNameMapper[blob.dtype], diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index e7bbe2c90351c4..1c4cf77ea0512f 100644 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -73,11 +73,15 @@ def test_mobile_exporter_datatypes(self): model = ModelHelper(name="mobile_exporter_test_model") model.Copy("data_int", "out") model.params.append("data_int") + model.Copy("data_obj", "out_obj") + model.params.append("data_obj") # Create our mobile exportable networks workspace.RunNetOnce(model.param_init_net) np_data_int = np.random.randint(100, size=(1, 1, 28, 28), dtype=np.int32) workspace.FeedBlob("data_int", np_data_int) + np_data_obj = np.array(['aa', 'bb']).astype(np.dtype('O')) + workspace.FeedBlob("data_obj", np_data_obj) init_net, predict_net = mobile_exporter.Export( workspace, model.net, model.params @@ -86,6 +90,7 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(model.net) workspace.RunNet(model.net) ref_out = workspace.FetchBlob("out") + ref_out_obj = workspace.FetchBlob("out_obj") # Clear the workspace workspace.ResetWorkspace() @@ -97,9 +102,11 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(predict_net, True) workspace.RunNet(predict_net.name) manual_run_out = workspace.FetchBlob("out") + manual_run_out_obj = workspace.FetchBlob("out_obj") np.testing.assert_allclose( ref_out, manual_run_out, atol=1e-10, rtol=1e-10 ) + np.testing.assert_equal(ref_out_obj, manual_run_out_obj) # Clear the workspace workspace.ResetWorkspace() @@ -109,11 +116,17 @@ def test_mobile_exporter_datatypes(self): init_net.SerializeToString(), predict_net.SerializeToString() ) - # Output is a vector of outputs but we only care about the first and only result + # Output is a vector of outputs. 
predictor_out = predictor.run([]) - assert len(predictor_out) == 1 - predictor_out = predictor_out[0] - + assert len(predictor_out) == 2 + predictor_out_int = predictor_out[1] + predictor_out_obj = predictor_out[0] + # The order in predictor_out is non-deterministic. Use type of the entry + # to figure out what to compare it to. + if isinstance(predictor_out[1][0], bytes): + predictor_out_int = predictor_out[0] + predictor_out_obj = predictor_out[1] np.testing.assert_allclose( - ref_out, predictor_out, atol=1e-10, rtol=1e-10 + ref_out, predictor_out_int, atol=1e-10, rtol=1e-10 ) + np.testing.assert_equal(ref_out_obj, predictor_out_obj) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 2edc88ce0458d4..6e66cd75315716 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -179,6 +179,7 @@ def test_transformer_SinkMaxPool(self): epsilon=st.floats(min_value=1e-5, max_value=1e-2), ) def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon): + workspace.ResetWorkspace() net = core.Net("net") c = input_channels h = size @@ -204,16 +205,22 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32)) + # This is necessary because 1/sqrt(var) is used and if var is too small + # we get floating point artifacts that cause test failures + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2") + preTransformOutput = workspace.FetchBlob("Y2").flatten() + workspace.FeedBlob("Y2", np.zeros((1, 1))) transformer.FuseConvBN(net) # Ensure fusion assert len(net.Proto().op) == 1 workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2") + postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference assert np.allclose( - preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08 + preTransformOutput, + postTransformOutput, + rtol=1e-02, + atol=1e-04 ) diff --git a/caffe2/requirements.txt b/caffe2/requirements.txt index 9a1d67efc7c2f3..07fd95b72582a2 100644 --- a/caffe2/requirements.txt +++ b/caffe2/requirements.txt @@ -1,2 +1,4 @@ numpy enum34 +pyyaml +typing diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 921deb9b0b41aa..ad9a80ed9203b5 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -38,10 +38,10 @@ #pragma once -#include +#include #include #include -#include "caffe2/utils/C++17.h" +#include namespace c10 { namespace guts { @@ -101,32 +101,32 @@ class array final { // No explicit construct/copy/destroy for aggregate type. // DR 776. - C10_CPP14_CONSTEXPR void fill(const value_type& __u) + AT_CPP14_CONSTEXPR void fill(const value_type& __u) { std::fill_n(begin(), size(), __u); } - C10_CPP14_CONSTEXPR void swap(array& __other) + AT_CPP14_CONSTEXPR void swap(array& __other) { std::swap_ranges(begin(), end(), __other.begin()); } // Iterators. 
- C10_CPP14_CONSTEXPR iterator begin() noexcept + AT_CPP14_CONSTEXPR iterator begin() noexcept { return iterator(data()); } constexpr const_iterator begin() const noexcept { return const_iterator(data()); } - C10_CPP14_CONSTEXPR iterator end() noexcept + AT_CPP14_CONSTEXPR iterator end() noexcept { return iterator(data() + _Nm); } constexpr const_iterator end() const noexcept { return const_iterator(data() + _Nm); } - C10_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept + AT_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } constexpr const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - C10_CPP14_CONSTEXPR reverse_iterator rend() noexcept + AT_CPP14_CONSTEXPR reverse_iterator rend() noexcept { return reverse_iterator(begin()); } constexpr const_reverse_iterator rend() const noexcept @@ -152,13 +152,13 @@ class array final { constexpr bool empty() const noexcept { return size() == 0; } // Element access. - C10_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept + AT_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept { return _AT_Type::_S_ref(_M_elems, __n); } constexpr const_reference operator[](size_type __n) const noexcept { return _AT_Type::_S_ref(_M_elems, __n); } - C10_CPP14_CONSTEXPR reference at(size_type __n) { + AT_CPP14_CONSTEXPR reference at(size_type __n) { if (__n >= _Nm) { detail::__throw_out_of_range(std::string() + "array::at: __n (which is " + to_string(__n) + ") " + @@ -177,13 +177,13 @@ class array final { _AT_Type::_S_ref(_M_elems, 0)); } - C10_CPP14_CONSTEXPR reference front() noexcept + AT_CPP14_CONSTEXPR reference front() noexcept { return *begin(); } constexpr const_reference front() const noexcept { return _AT_Type::_S_ref(_M_elems, 0); } - C10_CPP14_CONSTEXPR reference back() noexcept + AT_CPP14_CONSTEXPR reference back() noexcept { return _Nm ? *(end() - 1) : *end(); } constexpr const_reference back() const noexcept @@ -192,7 +192,7 @@ class array final { : _AT_Type::_S_ref(_M_elems, 0); } - C10_CPP14_CONSTEXPR pointer data() noexcept + AT_CPP14_CONSTEXPR pointer data() noexcept { return _AT_Type::_S_ptr(_M_elems); } constexpr const_pointer data() const noexcept diff --git a/caffe2/utils/C++17.cpp b/caffe2/utils/C++17.cpp deleted file mode 100644 index d75d9fc9dff490..00000000000000 --- a/caffe2/utils/C++17.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "caffe2/utils/C++17.h" diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 5db06663bf6403..67897c36fe485a 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -63,8 +63,6 @@ set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} set(LIB_SOURCES_CPU Array.cpp - C++17.cpp - IdWrapper.cpp Optional.cpp Metaprogramming.cpp TypeList.cpp diff --git a/caffe2/utils/IdWrapper.cpp b/caffe2/utils/IdWrapper.cpp deleted file mode 100644 index 7646a1392d4a6b..00000000000000 --- a/caffe2/utils/IdWrapper.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "caffe2/utils/IdWrapper.h" diff --git a/caffe2/utils/IdWrapper.h b/caffe2/utils/IdWrapper.h deleted file mode 100644 index 0c8e548ca017f6..00000000000000 --- a/caffe2/utils/IdWrapper.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once - -#include - -namespace c10 { namespace guts { - -/** - * This template simplifies generation of simple classes that wrap an id - * in a typesafe way. Namely, you can use it to create a very lightweight - * type that only offers equality comparators and hashing. 
Example: - * - * struct MyIdType final : IdWrapper { - * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} - * }; - * - * Then in the global top level namespace: - * - * C10_DEFINE_IDWRAPPER(MyIdType); - * - * That's it - equality operators and hash functions are automatically defined - * for you, given the underlying type supports it. - */ -template -class IdWrapper { -public: - using underlying_type = UnderlyingType; - using concrete_type = ConcreteType; - -protected: - constexpr explicit IdWrapper(underlying_type id) noexcept(noexcept(underlying_type(std::declval()))) - : id_(id) {} - - constexpr underlying_type underlyingId() const noexcept(noexcept(underlying_type(std::declval()))) { - return id_; - } - -private: - friend size_t hash_value(const concrete_type& v) { - return std::hash()(v.id_); - } - - // TODO Making operator== noexcept if underlying type is noexcept equality comparable doesn't work with GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator==(const concrete_type& lhs, const concrete_type& rhs) { - return lhs.id_ == rhs.id_; - } - - // TODO Making operator!= noexcept if operator== is noexcept doesn't work with GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator!=(const concrete_type& lhs, const concrete_type& rhs) { - return !(lhs == rhs); - } - - underlying_type id_; -}; - -}} - -#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ - namespace std { \ - template <> \ - struct hash { \ - size_t operator()(ClassName x) const { \ - return hash_value(x); \ - } \ - }; \ - } diff --git a/caffe2/utils/TypeList.h b/caffe2/utils/TypeList.h index 3494843feae121..7c20fa6613b966 100644 --- a/caffe2/utils/TypeList.h +++ b/caffe2/utils/TypeList.h @@ -1,6 +1,6 @@ #pragma once -#include "caffe2/utils/C++17.h" +#include #include "caffe2/utils/TypeTraits.h" namespace c10 { namespace guts { namespace typelist { diff --git a/caffe2/utils/TypeTraits.h b/caffe2/utils/TypeTraits.h index 004586987a81f7..c60f8a00b1ebdd 100644 --- a/caffe2/utils/TypeTraits.h +++ b/caffe2/utils/TypeTraits.h @@ -1,6 +1,6 @@ #pragma once -#include "caffe2/utils/C++17.h" +#include #include namespace c10 { diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index e0ae5cc0336e2a..c573542af5763c 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2605,6 +2605,13 @@ bool TransposeWithHPTT( axes_cm[i] = cm_fn(axes[cm_fn(i)]); dims_cm[i] = dims[cm_fn(i)]; } + + // HPTT doesn't handle 0 sized inputs. 
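+  // Returning false here makes the caller fall back to the generic
+  // (non-HPTT) transpose path, so zero-sized tensors are still handled
+  // instead of being passed to hptt::create_plan.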
+ for (auto dim : dims_cm) { + if (dim <= 0) { + return false; + } + } auto plan = hptt::create_plan( axes_cm.data(), ndim, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index bc30f35f2a2eee..3829219a933b5d 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -1,3 +1,9 @@ +# This ill-named file does a number of things: +# - Installs Caffe2 header files (this has nothing to do with code generation) +# - Configures caffe2/core/macros.h +# - Creates an ATen target for its generated C++ files and adds it +# as a dependency + if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -14,6 +20,11 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") +if (NOT BUILD_ATEN) + install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core + DESTINATION include/ATen/core + FILES_MATCHING PATTERN "*.h") +endif() install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index 2a4e61f97b0b18..2f2628bb149866 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -83,22 +83,26 @@ endif() cmake_pop_check_state() # ---[ Check for NUMA support -cmake_push_check_state(RESET) -set(CMAKE_REQUIRED_FLAGS "-std=c++11") -CHECK_CXX_SOURCE_COMPILES( +if (USE_NUMA) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + CHECK_CXX_SOURCE_COMPILES( "#include #include int main(int argc, char** argv) { }" CAFFE2_IS_NUMA_AVAILABLE) - -if (CAFFE2_IS_NUMA_AVAILABLE) - message(STATUS "NUMA is available") + if (CAFFE2_IS_NUMA_AVAILABLE) + message(STATUS "NUMA is available") + else() + message(STATUS "NUMA is not available") + set(CAFFE2_DISABLE_NUMA 1) + endif() + cmake_pop_check_state() else() - message(STATUS "NUMA is not available") + message(STATUS "NUMA is disabled") set(CAFFE2_DISABLE_NUMA 1) endif() -cmake_pop_check_state() # ---[ Check if we want to turn off deprecated warning due to glog. # Note(jiayq): on ubuntu 14.04, the default glog install uses ext/hash_set that @@ -157,6 +161,15 @@ if (${COMPILER_SUPPORTS_HIDDEN_INLINE_VISIBILITY}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CAFFE2_VISIBILITY_FLAG}") endif() +# ---[ Checks if linker supports -rdynamic. `-rdynamic` tells linker +# -to add all (including unused) symbols into the dynamic symbol +# -table. We need this to get symbols when generating backtrace at +# -runtime. +check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC) +if (${COMPILER_SUPPORTS_RDYNAMIC}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic") +endif() + # ---[ If we are using msvc, set no warning flags # Note(jiayq): if you are going to add a warning flag, check if this is # totally necessary, and only add when you see fit. If it is needed due to diff --git a/docs/libtorch.rst b/docs/libtorch.rst new file mode 100644 index 00000000000000..9ab59a4d749d66 --- /dev/null +++ b/docs/libtorch.rst @@ -0,0 +1,19 @@ +libtorch (C++-only) +=================== + +The core of pytorch can be built and used without Python. A +CMake-based build system compiles the C++ source code into a shared +object, libtorch.so. + +Building libtorch +----------------- + +There is a script which wraps the CMake build. 
Invoke it with + +:: + cd pytorch + BUILD_TORCH=ON ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 + ls torch/lib/tmp_install # output is produced here + ls torch/lib/tmp_install/lib/libtorch.so # of particular interest + +Future work will simplify this further. diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst index 93224462e3177e..de541b467e819e 100644 --- a/docs/source/distributions.rst +++ b/docs/source/distributions.rst @@ -203,6 +203,15 @@ Probability distributions - torch.distributions :undoc-members: :show-inheritance: +:hidden:`NegativeBinomial` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: torch.distributions.negative_binomial +.. autoclass:: NegativeBinomial + :members: + :undoc-members: + :show-inheritance: + :hidden:`Normal` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 987044bbd212f4..283409ea3676b8 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -338,6 +338,12 @@ Non-linear activations (weighted sum, nonlinearity) .. autoclass:: SELU :members: +:hidden:`CELU` +~~~~~~~~~~~~~~ + +.. autoclass:: CELU + :members: + :hidden:`Sigmoid` ~~~~~~~~~~~~~~~~~ @@ -604,6 +610,12 @@ Loss functions .. autoclass:: CrossEntropyLoss :members: +:hidden:`CTCLoss` +~~~~~~~~~~~~~~~~~ + +.. autoclass:: CTCLoss + :members: + :hidden:`NLLLoss` ~~~~~~~~~~~~~~~~~ @@ -984,6 +996,11 @@ Non-linear activation functions .. autofunction:: selu +:hidden:`celu` +~~~~~~~~~~~~~~ + +.. autofunction:: celu + :hidden:`leaky_relu` ~~~~~~~~~~~~~~~~~~~~ @@ -1180,6 +1197,11 @@ Loss functions .. autofunction:: cross_entropy +:hidden:`ctc_loss` +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: ctc_loss + :hidden:`hinge_embedding_loss` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/scripts/build_activation_images.py b/docs/source/scripts/build_activation_images.py index ce424d1ff188fa..e973933e205692 100644 --- a/docs/source/scripts/build_activation_images.py +++ b/docs/source/scripts/build_activation_images.py @@ -36,6 +36,7 @@ 'ReLU6', 'RReLU', 'SELU', + 'CELU', 'Sigmoid', 'Softplus', 'Softshrink', diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index c3c85797b4cd82..06b0305d28aae8 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -46,7 +46,7 @@ A tensor can be constructed from a Python :class:`list` or sequence using the If you have a numpy array and want to avoid a copy, use :func:`torch.as_tensor`. -An tensor of specific data type can be constructed by passing a +A tensor of specific data type can be constructed by passing a :class:`torch.dtype` and/or a :class:`torch.device` to a constructor or tensor creation op: diff --git a/docs/source/torch.rst b/docs/source/torch.rst index c1e914c03c74e7..c68ec039d74ce3 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -306,3 +306,7 @@ BLAS and LAPACK Operations .. autofunction:: svd .. autofunction:: symeig .. autofunction:: trtrs + +Utilities +---------------------------------- +.. 
autofunction:: compiled_with_cxx11_abi diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 1db0f546724103..62185d1e9dc821 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -296,6 +296,10 @@ fi # Add packages required for all Caffe2 builds add_package 'glog' add_package 'gflags' +add_package 'mkl' '>=2018' +add_package 'mkl-include' +add_package 'typing' +append_to_section 'build' '- pyyaml' caffe2_cmake_args+=("-DUSE_LEVELDB=OFF") caffe2_cmake_args+=("-DUSE_LMDB=OFF") @@ -303,10 +307,6 @@ caffe2_cmake_args+=("-DUSE_LMDB=OFF") # Add packages required for pytorch if [[ -n $integrated ]]; then add_package 'cffi' - add_package 'mkl' '>=2018' - add_package 'mkl-include' - add_package 'typing' - append_to_section 'build' '- pyyaml' append_to_section 'build' '- setuptools' #caffe2_cmake_args+=("-DBLAS=MKL") if [[ -n $cuda_ver ]]; then diff --git a/setup.py b/setup.py index 042d8668bb7b96..2e2ef60fb41313 100644 --- a/setup.py +++ b/setup.py @@ -659,7 +659,9 @@ def run(self): # Clang has an unfixed bug leading to spurious missing # braces warnings, see # https://bugs.llvm.org/show_bug.cgi?id=21629 - '-Wno-missing-braces' + '-Wno-missing-braces', + # gcc7 seems to report spurious warnings with this enabled + "-Wno-stringop-overflow", ] if check_env_flag('WERROR'): extra_compile_args.append('-Werror') @@ -1023,6 +1025,7 @@ def make_relative_rpath(path): 'lib/torch_shm_manager', 'lib/*.h', 'lib/include/ATen/*.h', + 'lib/include/ATen/core/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', diff --git a/setup_caffe2.py b/setup_caffe2.py index 0fd620549b31d8..d8ebf4fc7ed84f 100644 --- a/setup_caffe2.py +++ b/setup_caffe2.py @@ -131,6 +131,7 @@ def run(self): # configure cmake_args = [ find_executable('cmake'), + '-DUSE_ATEN=ON', '-DBUILD_SHARED_LIBS=OFF', '-DPYTHON_EXECUTABLE:FILEPATH={}'.format(sys.executable), '-DPYTHON_INCLUDE_DIR={}'.format(sysconfig.get_python_inc()), diff --git a/test/common.py b/test/common.py index 1eb4076dbf360b..4dbe3c56c47c98 100644 --- a/test/common.py +++ b/test/common.py @@ -118,16 +118,6 @@ def dec(fn): return dec -def skipIfNoZeroSize(fn): - @wraps(fn) - def wrapper(*args, **kwargs): - if torch._C._use_zero_size_dim(): - fn(*args, **kwargs) - else: - raise unittest.SkipTest('Compiled without arbitrary zero size dimension support') - return wrapper - - def get_cuda_memory_usage(): # we don't need CUDA synchronize because the statistics are not tracked at # actual freeing, but at when marking the block as free. 
diff --git a/test/common_nn.py b/test/common_nn.py index 6172f4b15adc3f..0444ba4eb6ae46 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -125,6 +125,7 @@ def get_weight(m): module_name='ELU', constructor_args=(2.,), input_size=(3, 2, 5), + reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)) ), # TODO: reference function dict( @@ -448,6 +449,43 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele return output +# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space +def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): + input_lengths = torch.tensor(input_lengths, dtype=torch.long) + target_lengths = torch.tensor(target_lengths, dtype=torch.long) + dt = log_probs.dtype + log_probs = log_probs.double() # we need the accuracy as we are not in logspace + targets = targets.long() + cum_target_lengths = target_lengths.cumsum(0) + losses = [] + for i in range(log_probs.size(1)): + input_length = input_lengths[i].item() + target_length = target_lengths[i].item() + cum_target_length = cum_target_lengths[i].item() + targets_prime = targets.new_full((2 * target_length + 1,), blank) + if targets.dim() == 2: + targets_prime[1::2] = targets[i, :target_length] + else: + targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] + probs = log_probs[:input_length, i].exp() + alpha = log_probs.new_zeros((target_length * 2 + 1,)) + alpha[0] = probs[0, blank] + alpha[1] = probs[0, targets_prime[1]] + mask_third = (targets_prime[:-2] != targets_prime[2:]) + for t in range(1, input_length): + alpha_next = alpha.clone() + alpha_next[1:] += alpha[:-1] + alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) + alpha = probs[t, targets_prime] * alpha_next + losses.append(-alpha[-2:].sum().log()[None]) + output = torch.cat(losses, 0) + if reduction == 'elementwise_mean': + return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() + elif reduction == 'sum': + return output.sum() + output = output.to(dt) + return output + loss_reference_fns = { 'KLDivLoss': kldivloss_reference, 'NLLLoss': nllloss_reference, @@ -460,6 +498,7 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele 'CosineEmbeddingLoss': cosineembeddingloss_reference, 'TripletMarginLoss': tripletmarginloss_reference, 'MarginRankingLoss': marginrankingloss_reference, + 'CTCLoss': ctcloss_reference, } @@ -841,7 +880,7 @@ def check_criterion_jacobian(self, criterion, input, target): class TestBase(object): - _required_arg_names = {'constructor_args', 'input'} + _required_arg_names = {'constructor_args', 'input', 'extra_args'} def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): self.desc = desc @@ -850,8 +889,8 @@ def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwa self.reference_fn = reference_fn for name in self._required_arg_names: if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name == 'constructor_args': - kwargs['constructor_args'] = tuple() + if name in {'constructor_args', 'extra_args'}: + kwargs[name] = tuple() else: raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" 
.format(self.get_name(), name)) @@ -879,6 +918,10 @@ def _unpack(self, value): def constructor_args(self): return self._get_arg('constructor_args', True) + @property + def extra_args(self): + return self._get_arg('extra_args', True) + def _get_arg(self, name, unpack): assert name in self._required_arg_names @@ -1103,9 +1146,9 @@ def __call__(self, test_case): target = self._get_target() if self.reference_fn is not None: - out = test_case._forward_criterion(module, input, target) - expected_out = self.reference_fn(deepcopy(input), - deepcopy(target), module) + out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args) + ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,) + expected_out = self.reference_fn(*ref_args) if isinstance(expected_out, torch.Tensor): expected_out = expected_out.item() test_case.assertEqual(out, expected_out) diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 8e66a66962d44f..be2fd6e0d969ba 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -237,7 +237,7 @@ TEST_CASE("modules") { REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { - auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0); + auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect new file mode 100644 index 00000000000000..bcbcffaee486a3 --- /dev/null +++ b/test/expect/TestBatched.test_for.expect @@ -0,0 +1,22 @@ +graph(%x.1_data : Dynamic + %x.1_mask : Dynamic + %x.1_dims : Dynamic + %y_data : Dynamic + %y_mask : Dynamic + %y_dims : Dynamic) { + %6 : int = prim::Constant[value=10]() + %7 : int = prim::Constant[value=1]() + %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) + block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { + %13 : int = prim::Constant[value=1]() + %14 : Long() = prim::NumToTensor(%13) + %alpha : float = prim::TensorToNum(%14) + %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) + %mask : Dynamic = aten::mul(%5_mask, %y_mask) + %dims : Dynamic = aten::__or__(%5_dims, %y_dims) + %19 : int = prim::Constant[value=1]() + %data : Dynamic = aten::where(%mask, %data.1, %5_data) + -> (%19, %data, %mask, %dims) + } + return (%x, %21, %22); +} diff --git a/test/expect/TestBatched.test_if_else.expect b/test/expect/TestBatched.test_if_else.expect new file mode 100644 index 00000000000000..0698584377a433 --- /dev/null +++ b/test/expect/TestBatched.test_if_else.expect @@ -0,0 +1,52 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : Dynamic = aten::gt(%a.1_data, %b_data) + %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %9 : int = prim::TensorToNum(%6) + %10 : int = prim::Constant[value=1]() + %11 : Long() = prim::NumToTensor(%10) + %alpha.1 : float = prim::TensorToNum(%11) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) + %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %16 : int = prim::Constant[value=1]() + %17 : Long() = prim::NumToTensor(%16) + %alpha : float = prim::TensorToNum(%17) + %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic 
= aten::__or__(%a.1_dims, %b_dims) + %22 : Dynamic = aten::type_as(%7, %6) + %cond_mask.1 : Dynamic = aten::mul(%6, %22) + %24 : int = aten::dim(%cond_mask.1) + %25 : int = prim::Constant[value=1]() + %26 : int = aten::eq(%24, %25) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%26) + block0() { + %30 : int = aten::dim(%data.1) + %31 : int = prim::Constant[value=1]() + %32 : int = aten::sub(%30, %31) + %33 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%32, %33, %cond_mask.1) + block0(%_ : int, %36 : Dynamic) { + %37 : int = aten::dim(%36) + %data.2 : Dynamic = aten::unsqueeze(%36, %37) + %39 : int = prim::Constant[value=1]() + -> (%39, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) + %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) + %res_dims : Dynamic = aten::__or__(%dims.1, %dims) + return (%res_data, %res_mask, %res_dims); +} diff --git a/test/expect/TestBatched.test_if_else_with_scalar.expect b/test/expect/TestBatched.test_if_else_with_scalar.expect new file mode 100644 index 00000000000000..c7755a5b5501fc --- /dev/null +++ b/test/expect/TestBatched.test_if_else_with_scalar.expect @@ -0,0 +1,53 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : float = prim::Constant[value=0.1]() + %7 : Float() = prim::NumToTensor(%6) + %other : float = prim::TensorToNum(%7) + %9 : Dynamic = aten::gt(%a.1_data, %other) + %10 : int = prim::TensorToNum(%9) + %11 : int = prim::Constant[value=1]() + %12 : Long() = prim::NumToTensor(%11) + %alpha.1 : float = prim::TensorToNum(%12) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) + %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %17 : int = prim::Constant[value=1]() + %18 : Long() = prim::NumToTensor(%17) + %alpha : float = prim::TensorToNum(%18) + %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %23 : Dynamic = aten::type_as(%a.1_mask, %9) + %cond_mask.1 : Dynamic = aten::mul(%9, %23) + %25 : int = aten::dim(%cond_mask.1) + %26 : int = prim::Constant[value=1]() + %27 : int = aten::eq(%25, %26) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%27) + block0() { + %31 : int = aten::dim(%data.1) + %32 : int = prim::Constant[value=1]() + %33 : int = aten::sub(%31, %32) + %34 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%33, %34, %cond_mask.1) + block0(%_ : int, %37 : Dynamic) { + %38 : int = aten::dim(%37) + %data.2 : Dynamic = aten::unsqueeze(%37, %38) + %40 : int = prim::Constant[value=1]() + -> (%40, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) + %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) + %res_dims : Dynamic = aten::__or__(%dims.1, %dims) + return (%res_data, %res_mask, %res_dims); +} diff --git 
a/test/expect/TestBatched.test_if_noelse.expect b/test/expect/TestBatched.test_if_noelse.expect new file mode 100644 index 00000000000000..1d98fe9d02f29c --- /dev/null +++ b/test/expect/TestBatched.test_if_noelse.expect @@ -0,0 +1,46 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : Dynamic = aten::gt(%a.1_data, %b_data) + %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %9 : int = prim::TensorToNum(%6) + %10 : int = prim::Constant[value=1]() + %11 : Long() = prim::NumToTensor(%10) + %alpha : float = prim::TensorToNum(%11) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %16 : Dynamic = aten::type_as(%7, %6) + %cond_mask.1 : Dynamic = aten::mul(%6, %16) + %18 : int = aten::dim(%cond_mask.1) + %19 : int = prim::Constant[value=1]() + %20 : int = aten::eq(%18, %19) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%20) + block0() { + %24 : int = aten::dim(%data.1) + %25 : int = prim::Constant[value=1]() + %26 : int = aten::sub(%24, %25) + %27 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%26, %27, %cond_mask.1) + block0(%_ : int, %30 : Dynamic) { + %31 : int = aten::dim(%30) + %data.2 : Dynamic = aten::unsqueeze(%30, %31) + %33 : int = prim::Constant[value=1]() + -> (%33, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) + %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) + %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) + return (%res_data, %res_mask, %res_dims); +} diff --git a/test/expect/TestBatched.test_if_noelse_with_scalar.expect b/test/expect/TestBatched.test_if_noelse_with_scalar.expect new file mode 100644 index 00000000000000..935bedb22b3f80 --- /dev/null +++ b/test/expect/TestBatched.test_if_noelse_with_scalar.expect @@ -0,0 +1,47 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : float = prim::Constant[value=0.1]() + %7 : Float() = prim::NumToTensor(%6) + %other : float = prim::TensorToNum(%7) + %9 : Dynamic = aten::gt(%a.1_data, %other) + %10 : int = prim::TensorToNum(%9) + %11 : int = prim::Constant[value=1]() + %12 : Long() = prim::NumToTensor(%11) + %alpha : float = prim::TensorToNum(%12) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %17 : Dynamic = aten::type_as(%a.1_mask, %9) + %cond_mask.1 : Dynamic = aten::mul(%9, %17) + %19 : int = aten::dim(%cond_mask.1) + %20 : int = prim::Constant[value=1]() + %21 : int = aten::eq(%19, %20) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%21) + block0() { + %25 : int = aten::dim(%data.1) + %26 : int = prim::Constant[value=1]() + %27 : int = aten::sub(%25, %26) + %28 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%27, %28, %cond_mask.1) + block0(%_ : int, %31 : Dynamic) { + %32 : int = aten::dim(%31) + %data.2 : Dynamic = aten::unsqueeze(%31, %32) + %34 : int = prim::Constant[value=1]() + -> (%34, %data.2) + } 
+ %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) + %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) + %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) + return (%res_data, %res_mask, %res_dims); +} diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect new file mode 100644 index 00000000000000..a32cd392044f00 --- /dev/null +++ b/test/expect/TestBatched.test_while.expect @@ -0,0 +1,65 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : int = prim::Constant[value=2147483647]() + %7 : Dynamic = aten::gt(%a.1_data, %b_data) + %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %10 : int = prim::TensorToNum(%7) + %11 : Dynamic = aten::mul(%7, %8) + %12 : Dynamic = aten::sum(%11) + %13 : int = prim::Constant[value=0]() + %14 : Dynamic = aten::gt(%12, %13) + %15 : int = prim::TensorToNum(%14) + %64 : Dynamic, %65 : Dynamic, %66 : Dynamic, %a : Dynamic, %62 : Dynamic, %63 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) + block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { + %24 : int = prim::Constant[value=1]() + %25 : Long() = prim::NumToTensor(%24) + %alpha : float = prim::TensorToNum(%25) + %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%6_mask, %b_mask) + %dims : Dynamic = aten::__or__(%6_dims, %b_dims) + %30 : Dynamic = aten::gt(%data.1, %b_data) + %31 : Dynamic = aten::mul(%mask, %b_mask) + %32 : Dynamic = aten::__or__(%dims, %b_dims) + %33 : int = prim::TensorToNum(%30) + %34 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) + %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %34) + %36 : int = aten::dim(%cond_mask.1) + %37 : int = prim::Constant[value=1]() + %38 : int = aten::eq(%36, %37) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) + block0() { + %42 : int = aten::dim(%data.1) + %43 : int = prim::Constant[value=1]() + %44 : int = aten::sub(%42, %43) + %45 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%44, %45, %cond_mask.1) + block0(%_ : int, %48 : Dynamic) { + %49 : int = aten::dim(%48) + %data.2 : Dynamic = aten::unsqueeze(%48, %49) + %51 : int = prim::Constant[value=1]() + -> (%51, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) + %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) + %res_dims : Dynamic = aten::__or__(%dims, %6_dims) + %57 : Dynamic = aten::mul(%30, %31) + %58 : Dynamic = aten::sum(%57) + %59 : int = prim::Constant[value=0]() + %60 : Dynamic = aten::gt(%58, %59) + %61 : int = prim::TensorToNum(%60) + -> (%61, %30, %31, %32, %res_data, %res_mask, %res_dims) + } + return (%a, %62, %63); +} diff --git a/test/expect/TestJit.test_concat_fusion.expect b/test/expect/TestJit.test_concat_fusion.expect index 027c2de33e5926..454a84cba1db76 100644 
--- a/test/expect/TestJit.test_concat_fusion.expect +++ b/test/expect/TestJit.test_concat_fusion.expect @@ -3,12 +3,11 @@ graph(%0 : Float(3, 20) %2 : Float(6, 20) = prim::FusionGroup_0[device=0](%0, %1) return (%2); } -with prim::FusionGroup_0 = graph(%4 : Float(3, 20) - %5 : Float(3, 20)) { - %7 : int = prim::Constant[value=1]() - %8 : Float(3, 20) = aten::add(%4, %5, %7) - %6 : Float(3, 20) = aten::mul(%4, %5) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%8, %6, %2) - return (%3); +with prim::FusionGroup_0 = graph(%3 : Float(3, 20) + %4 : Float(3, 20)) { + %6 : int = prim::Constant[value=1]() + %7 : Float(3, 20) = aten::add(%3, %4, %6) + %5 : Float(3, 20) = aten::mul(%3, %4) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%7, %5) + return (%2); } diff --git a/test/expect/TestJit.test_constant_prop_nested.expect b/test/expect/TestJit.test_constant_prop_nested.expect new file mode 100644 index 00000000000000..09ef82076edc4a --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_nested.expect @@ -0,0 +1,15 @@ +graph(%a : Dynamic) { + %1 : int = prim::Constant[value=2]() + %2 : Dynamic = aten::lt(%a, %1) + %3 : int = prim::TensorToNum(%2) + %c : int = prim::If(%3) + block0() { + %5 : int = prim::Constant[value=5]() + -> (%5) + } + block1() { + %6 : int = prim::Constant[value=1]() + -> (%6) + } + return (%c); +} diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect new file mode 100644 index 00000000000000..7cadfdbbc6b3ea --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -0,0 +1,12 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=6]() + %2 : Dynamic = ^FIXME_zerol()() + %a : Dynamic = aten::add(%1, %2) + = prim::Print(%a) + %4 : int = prim::Constant[value=2]() + %5 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %4, %5) + %7 : int = prim::Constant[value=1]() + %8 : Dynamic = aten::add(%b, %input_tensor, %7) + return (%8); +} diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect new file mode 100644 index 00000000000000..a6c305258bff95 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_rand.expect @@ -0,0 +1,11 @@ +graph() { + %0 : int = prim::Constant[value=6]() + %1 : int = prim::Constant[value=0]() + %2 : int[] = prim::Constant[value=[0, -1]]() + %3 : int[] = prim::Constant[value=[3]]() + %a : Dynamic = aten::randn(%3, %0, %1, %2) + %5 : int = prim::Constant[value=2]() + %6 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %5, %6) + return (%b); +} diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect new file mode 100644 index 00000000000000..029f9ac05a0783 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -0,0 +1,5 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=8]() + %2 : Dynamic = aten::add(%1, %input_tensor) + return (%2); +} diff --git a/test/expect/TestJit.test_lstm_fusion_concat.expect b/test/expect/TestJit.test_lstm_fusion_concat.expect index 7884a95c48c9a1..f0771c133c11d9 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat.expect @@ -16,34 +16,33 @@ graph(%0 : Float(3, 10) %21 : Float(6, 20) = prim::FusionGroup_0[device=0](%2, %16, %20, %15, %19, %14, %18, %13, %17) return (%21); } -with prim::FusionGroup_0 = graph(%16 : Float(3, 20) +with prim::FusionGroup_0 = 
graph(%15 : Float(3, 20) + %25 : Float(3!, 20) %26 : Float(3!, 20) - %27 : Float(3!, 20) + %29 : Float(3!, 20) %30 : Float(3!, 20) - %31 : Float(3!, 20) + %33 : Float(3!, 20) %34 : Float(3!, 20) - %35 : Float(3!, 20) - %38 : Float(3!, 20) - %39 : Float(3!, 20)) { - %40 : int = prim::Constant[value=1]() - %41 : Float(3, 20) = aten::add(%38, %39, %40) - %36 : int = prim::Constant[value=1]() - %37 : Float(3, 20) = aten::add(%34, %35, %36) - %32 : int = prim::Constant[value=1]() - %33 : Float(3, 20) = aten::add(%30, %31, %32) - %28 : int = prim::Constant[value=1]() - %29 : Float(3, 20) = aten::add(%26, %27, %28) - %25 : Float(3, 20) = aten::sigmoid(%41) - %23 : Float(3, 20) = aten::sigmoid(%37) - %21 : Float(3, 20) = aten::tanh(%33) - %19 : Float(3, 20) = aten::sigmoid(%29) - %17 : Float(3, 20) = aten::mul(%23, %16) - %14 : Float(3, 20) = aten::mul(%25, %21) - %10 : int = prim::Constant[value=1]() - %11 : Float(3, 20) = aten::add(%17, %14, %10) - %7 : Float(3, 20) = aten::tanh(%11) - %6 : Float(3, 20) = aten::mul(%19, %7) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%6, %11, %2) - return (%3); + %37 : Float(3!, 20) + %38 : Float(3!, 20)) { + %39 : int = prim::Constant[value=1]() + %40 : Float(3, 20) = aten::add(%37, %38, %39) + %35 : int = prim::Constant[value=1]() + %36 : Float(3, 20) = aten::add(%33, %34, %35) + %31 : int = prim::Constant[value=1]() + %32 : Float(3, 20) = aten::add(%29, %30, %31) + %27 : int = prim::Constant[value=1]() + %28 : Float(3, 20) = aten::add(%25, %26, %27) + %24 : Float(3, 20) = aten::sigmoid(%40) + %22 : Float(3, 20) = aten::sigmoid(%36) + %20 : Float(3, 20) = aten::tanh(%32) + %18 : Float(3, 20) = aten::sigmoid(%28) + %16 : Float(3, 20) = aten::mul(%22, %15) + %13 : Float(3, 20) = aten::mul(%24, %20) + %9 : int = prim::Constant[value=1]() + %10 : Float(3, 20) = aten::add(%16, %13, %9) + %6 : Float(3, 20) = aten::tanh(%10) + %5 : Float(3, 20) = aten::mul(%18, %6) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%5, %10) + return (%2); } diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect index ea2fa3737c0556..c8c82e5199c030 100644 --- a/test/expect/TestScript.test_cat_lifts.expect +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -1,15 +1,18 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x, %x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct() + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 591e499da96671..37f08643f139a4 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -6,6 +6,7 @@ graph(%0 : Double(100) %5 : Double(4) = aten::view(%2, %4) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) - %19 : Double(100) = aten::index_put(%0, %7, %5) - return (%19); + %8 : Dynamic[] = prim::ListConstruct(%7) + %20 : Double(100) = aten::index_put(%0, %8, %5) + return (%20); } diff 
--git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 42f8e49142942e..772308223b454b 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -3,6 +3,7 @@ graph(%0 : Double(100) %2 : Double(4)) { %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) - %16 : Double(100) = aten::index_put(%0, %4, %2) - return (%16); + %5 : Dynamic[] = prim::ListConstruct(%4) + %17 : Double(100) = aten::index_put(%0, %5, %2) + return (%17); } diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect new file mode 100644 index 00000000000000..a8eff9ab2c1387 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_elu.expect @@ -0,0 +1,63 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.3" +graph { + node { + input: "0" + output: "1" + op_type: "Elu" + attribute { + name: "alpha" + f: 1 + type: FLOAT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index 3d8210b14bcbee..fc23156d1cbf47 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index e50f2e12537d56..204a59e88ef5a6 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index 3cda8f244819b7..d3eb9cf08c30a6 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index 2aefbc6dbc8622..39ba6940e2289c 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index 83656cb3a5ce04..cd9c4eaaaf50a7 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect 
b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index b1ff53c2e4e7d8..3c1321664dd3fd 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -10,33 +10,33 @@ graph { t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "0" - input: "1" output: "2" - op_type: "Reshape" - } - node { - output: "3" op_type: "Constant" attribute { name: "value" t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" } type: TENSOR } } node { + input: "0" input: "2" + output: "3" + op_type: "Reshape" + } + node { input: "3" + input: "1" output: "4" op_type: "Tile" } diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 1e2c401dcc3ac0..ba8292e616686a 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -364,6 +364,10 @@ def test_pow(self): y = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x, y: x.pow(y), (x, y)) + def test_elu(self): + x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) + self.assertONNX(nn.ELU(), x) + def test_selu(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(nn.SELU(), x) diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 85ef2eac5bf2ce..7130a7695cc69b 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -676,6 +676,52 @@ def forward(self, x): x = Variable(torch.randn(*shape)) self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + def test_cumsum(self): + shape = (3, 4, 5) + for params in [{'dim': i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return torch.cumsum(x, **params) + x = Variable(torch.randn(*shape)) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(4, 3, 2, 1), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dim_overflow(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(1, 2), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dynamic(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x, y): + return x.repeat(y.size()[0] / 2, y.size()[1] * 2) + + x = Variable(torch.randn(1, 2), requires_grad=True) + y = Variable(torch.randn(2, 4), requires_grad=True) + 
self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) + def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{'dim': i} for i in range(len(shape))]: diff --git a/test/test_autograd.py b/test/test_autograd.py index 3ef7c21d49fc90..9d39043db9b56d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -15,7 +15,7 @@ from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \ - suppress_warnings, skipIfNoZeroSize, TEST_WITH_ROCM + suppress_warnings, TEST_WITH_ROCM from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction from torch.testing import make_non_contiguous, randn_like @@ -1851,6 +1851,16 @@ def backward(ctx, grad_output): out.sum().backward() self.assertEqual(x.grad.data, y_data) + def test_broadcast_tensors(self): + f_args_variable = (torch.randn(3, requires_grad=True), + torch.randn(1, 2, 1, requires_grad=True), + torch.randn(1, 1, requires_grad=True), + torch.randn(5, 1, 1, requires_grad=True)) + f_args_tensor = deepcopy(unpack_variables(f_args_variable)) + run_functional_checks(self, "test_broadcast_tensors", "broadcast", + lambda a, b, c, d: torch.broadcast_tensors(a, b, c, d), + True, f_args_variable, f_args_tensor) + def test_cat(self): f_args_variable = (torch.randn(1, S, S, requires_grad=True), torch.randn(2, S, S, requires_grad=True), @@ -1892,7 +1902,6 @@ def test_cat_empty_legacy(self): False, f_args_variable, f_args_tensor) self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION)) - @skipIfNoZeroSize def test_cat_empty(self): f_args_variable = (torch.randn(0, S, requires_grad=True), torch.randn(S, S, requires_grad=True)) @@ -1901,7 +1910,6 @@ def test_cat_empty(self): lambda a, b: torch.cat((a, b)), True, f_args_variable, f_args_tensor) - @skipIfNoLapack def test_potrf(self): root = Variable(torch.tril(torch.rand(S, S)), requires_grad=True) @@ -3123,7 +3131,7 @@ class dont_convert(tuple): ('select', (S, S, S), (1, -1), 'wrap_dim', [0]), ('select', (S,), (0, 2), '1d'), ('narrow', (S, S, S), (1, 2, 2), 'dim', [0]), - ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0], [skipIfNoZeroSize]), + ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0]), ('squeeze', (S, 1, S, 1), NO_ARGS), ('squeeze', (1, 1, 1, 1), NO_ARGS, 'input_sizes_are_ones'), ('squeeze', (S, 1, S, 1), (1,), '1_dim', [0]), diff --git a/test/test_distributions.py b/test/test_distributions.py index 7effb9012e9fc6..8a607ece6931c5 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -42,8 +42,8 @@ Independent, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, Multinomial, MultivariateNormal, - Normal, OneHotCategorical, Pareto, Poisson, - RelaxedBernoulli, RelaxedOneHotCategorical, + NegativeBinomial, Normal, OneHotCategorical, Pareto, + Poisson, RelaxedBernoulli, RelaxedOneHotCategorical, StudentT, TransformedDistribution, Uniform, Weibull, constraints, kl_divergence) from torch.distributions.constraint_registry import biject_to, transform_to @@ -123,6 +123,16 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': torch.tensor(0.)}, ]), + Example(NegativeBinomial, [ + {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': 10}, + 
{'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10])}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10, 8])}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), + 'total_count': torch.tensor([[10., 8.], [5., 3.]])}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), + 'total_count': torch.tensor(0.)}, + ]), Example(Multinomial, [ {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': 10}, @@ -442,6 +452,12 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), 'total_count': 10}, ]), + Example(NegativeBinomial, [ + {'probs': torch.tensor([[-0.0000001, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), + 'total_count': 10}, + {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), + 'total_count': 10}, + ]), Example(Cauchy, [ {'loc': 0.0, 'scale': -1.0}, {'loc': torch.tensor([0.0]), 'scale': 0.0}, @@ -911,6 +927,37 @@ def test_binomial_enumerate_support(self): bin1 = Binomial(torch.tensor(5), torch.tensor(0.5)) self.assertEqual(bin1.enumerate_support(), torch.arange(6)) + def test_negative_binomial(self): + p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) + for total_count in [1, 2, 10]: + self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, p), [p]) + self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, None, p.log()), [p]) + self.assertRaises(NotImplementedError, NegativeBinomial(10, p).rsample) + self.assertRaises(NotImplementedError, NegativeBinomial(10, p).entropy) + + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + def test_negative_binomial_log_prob(self): + probs = torch.tensor(torch.arange(0.05, 1, 0.1)) + for total_count in [1, 2, 10]: + + def ref_log_prob(idx, x, log_prob): + p = probs.view(-1)[idx].item() + expected = scipy.stats.nbinom(total_count, 1 - p).logpmf(x) + self.assertAlmostEqual(log_prob, expected, places=3) + + self._check_log_prob(NegativeBinomial(total_count, probs), ref_log_prob) + logits = probs_to_logits(probs, is_binary=True) + self._check_log_prob(NegativeBinomial(total_count, logits=logits), ref_log_prob) + + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + def test_negative_binomial_log_prob_vectorized_count(self): + probs = torch.tensor([0.2, 0.7, 0.9]) + for total_count, sample in [(torch.tensor([10]), torch.tensor([7., 3., 9.])), + (torch.tensor([1, 2, 10]), torch.tensor([0., 1., 9.]))]: + log_prob = NegativeBinomial(total_count, probs).log_prob(sample) + expected = scipy.stats.nbinom(total_count.cpu().numpy(), 1 - probs.cpu().numpy()).logpmf(sample) + self.assertAlmostEqual(log_prob, expected, places=4) + def test_multinomial_1d(self): total_count = 10 p = torch.tensor([0.1, 0.2, 0.3], requires_grad=True) @@ -3475,7 +3522,7 @@ def setUp(self): ), ( Binomial(10, simplex_tensor), - scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor) + scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor.numpy()) ), ( Cauchy(random_var, positive_var), @@ -3862,6 +3909,9 @@ def get_constraints(self, is_cuda=False): constraints.greater_than(0), constraints.greater_than(2), constraints.greater_than(-2), + constraints.greater_than_eq(0), + constraints.greater_than_eq(2), + constraints.greater_than_eq(-2), constraints.less_than(tensor([-10., -2, 0, 2, 10])), 
constraints.less_than(0), constraints.less_than(2), @@ -3871,6 +3921,10 @@ def get_constraints(self, is_cuda=False): tensor([-3., 3, 1, 5, 5])), constraints.interval(-2, -1), constraints.interval(1, 2), + constraints.half_open_interval(tensor([-4., -2, 0, 2, 4]), + tensor([-3., 3, 1, 5, 5])), + constraints.half_open_interval(-2, -1), + constraints.half_open_interval(1, 2), constraints.simplex, constraints.lower_cholesky, ] diff --git a/test/test_indexing.py b/test/test_indexing.py index 00865d9f576b74..afe9e6d60c653c 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -1,4 +1,4 @@ -from common import TestCase, run_tests, skipIfNoZeroSize +from common import TestCase, run_tests import torch import warnings from torch import tensor @@ -93,7 +93,6 @@ def test_empty_index(self): y[mask] = -1 self.assertEqual(x, y) - @skipIfNoZeroSize def test_empty_ndim_index(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -104,14 +103,12 @@ def test_empty_ndim_index(self): self.assertEqual(torch.empty(2, 0, 6, 4, 5, device=device), x[:, torch.empty(0, 6, dtype=torch.int64, device=device)]) - @skipIfNoZeroSize def test_empty_ndim_index_bool(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(5, device=device) self.assertRaises(IndexError, lambda: x[torch.empty(0, 2, dtype=torch.uint8, device=device)]) - @skipIfNoZeroSize def test_empty_slice(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -475,26 +472,18 @@ def test_boolean_indexing_twodim(self): def test_boolean_indexing_weirdness(self): # Weird boolean indexing things a = torch.ones((2, 3, 4)) - if torch._C._use_zero_size_dim(): - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) - else: - self.assertEqual((0,), a[False, True, ...].shape) + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[True, [0, 1], True, True, [1], [[2]]]) - if torch._C._use_zero_size_dim(): - self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) + self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) def test_boolean_indexing_weirdness_tensors(self): # Weird boolean indexing things false = torch.tensor(False) true = torch.tensor(True) a = torch.ones((2, 3, 4)) - if torch._C._use_zero_size_dim(): - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) - else: - self.assertEqual((0,), a[False, True, ...].shape) + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[true, [0, 1], true, true, [1], [[2]]]) - if torch._C._use_zero_size_dim(): - self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) + self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) def test_boolean_indexing_alldims(self): true = torch.tensor(True) diff --git a/test/test_jit.py b/test/test_jit.py index ab4c907e72d19f..b3bbe9892bc7db 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1122,13 +1122,95 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) + def test_constant_prop_simple(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + b = a + 2 + return b + input_tensor + + x = torch.tensor(2) + out_ref = constant_prop(x) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + 
self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_nested(self): + @torch.jit.script + def constant_prop(a): + b = 2 + 1 + if a < 2: + c = b + 2 + else: + c = b - 2 + return c + + out_ref = constant_prop(torch.tensor(2)) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_print(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + FIXME_zerol() + print(a) + b = a + 2 + return b + input_tensor + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_rand(self): + @torch.jit.script + def constant_prop(): + a = torch.randn([3]) + b = a + 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_if_constant(self): + @torch.jit.script + def constant_prop(): + b = 3 + if True: + b = 1 + if False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_loop_constant(self): + @torch.jit.script + def constant_prop(): + b = 0 + while True: + b = 1 + while False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them def rand_batch(self, *dims): dims = [dim for dim in dims if dim != ()] - xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:])) for i in range(dims[0])] - xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]])) + xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:]), + requires_grad=True) for i in range(dims[0])] + xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]]).byte()) return xs, xb def test_create_batchtensor(self): @@ -1156,20 +1238,20 @@ def tanh(a): def test_batch_elementwise_binary(self): @torch.jit.batch(batch_size=4) - def mul(a, b): - return a * b + def add(a, b): + return a + b xs, batch = self.rand_batch(4, (True, 3), (False, 2)) xs2, batch2 = xs, batch - res_batch = mul(batch, batch2) - res = [torch.mul(xs[j], xs2[j]) for j in range(4)] + res_batch = add(batch, batch2) + res = [torch.add(xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) # test broadcast xs, batch = self.rand_batch(4, (False, 3), (False, 2)) b = torch.rand(3, 2) - res_batch = mul(batch, b) - res = [torch.mul(xs[j], b) for j in range(4)] + res_batch = add(batch, b) + res = [torch.add(xs[j], b) for j in range(4)] self.assertEqual(res, res_batch.examples()) def test_batch_mm(self): @@ -1216,6 +1298,33 @@ def matmul_test(xs, batch, xs2, batch2): xs2, batch2 = self.rand_batch(4, (False, 2), (True, 3)) matmul_test(xs, batch, xs2, batch2) + def test_batch_select(self): + @torch.jit.batch(batch_size=4) + def select(x): + return torch.select(x, 1, 0) + + xs, batch = self.rand_batch(4, (True, 3), (True, 2)) + res_batch = select(batch) + res = [torch.select(xs[j], 1, 0) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + xs, batch = self.rand_batch(4, (False, 3), (True, 2)) + res_batch = select(batch) + res = 
[torch.select(xs[j], 1, 0) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_batch_index_select(self): + @torch.jit.batch(batch_size=4) + def index_select(x, ind): + return x.index_select(1, ind) + + xs, batch = self.rand_batch(4, (False, 5), (True, 2)) + ind = [torch.randint(0, 4, (1,), dtype=torch.long) for i in range(4)] + ind_batch = BatchTensor(ind, torch.tensor([]).byte()) + res_batch = index_select(batch, ind_batch) + res = [torch.index_select(xs[j], 1, ind[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + def test_batch_where(self): @torch.jit.batch(batch_size=4) def where(c, a, b): @@ -1232,43 +1341,300 @@ def where(c, a, b): res = [torch.where(xs_cond[j], xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) - @unittest.skip("Need support for scalar arguments") - def test_lstm_cell(self): - def LSTMCell(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - return h_t + def test_batch_argmax(self): + @torch.jit.batch(batch_size=4) + def argmax(a): + return torch.argmax(a, 1) + + xs, batch = self.rand_batch(4, (True, 5), (True, 6)) + res_batch = argmax(batch) + res = [torch.argmax(xs[j], 1) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + @torch.jit.batch(batch_size=4) + def argmax(a): + return torch.argmax(a, 1, False) + + res_batch = argmax(batch) + res = [torch.argmax(xs[j], 1, False) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_batch_topk(self): + @torch.jit.batch(batch_size=4) + def topk(a): + return torch.topk(a, 3, 1) + + xs, batch = self.rand_batch(4, (False, 5), (True, 6)) + + # along static dim + res_batch = topk(batch) + res = [torch.topk(xs[j], 3, 1)[0] for j in range(4)] + res_idx = [torch.topk(xs[j], 3, 1)[1] for j in range(4)] + self.assertEqual(res, res_batch[0].examples()) + self.assertEqual(res_idx, res_batch[1].examples()) + + @torch.jit.batch(batch_size=4) + def topk(a): + return torch.topk(a, 1, 2) + + # along dynamic dim + res_batch = topk(batch) + res = [torch.topk(xs[j], 1, 2)[0] for j in range(4)] + res_idx = [torch.topk(xs[j], 1, 2)[1] for j in range(4)] + self.assertEqual(res, res_batch[0].examples()) + self.assertEqual(res_idx, res_batch[1].examples()) + + def test_batch_softmax(self): + @torch.jit.batch(batch_size=4) + def softmax(a): + return torch.softmax(a, 1) + + xs, batch = self.rand_batch(4, (False, 5), (True, 6)) + + # along static dim + res_batch = softmax(batch) + res = [torch.softmax(xs[j], 1) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + @torch.jit.batch(batch_size=4) + def softmax(a): + return torch.softmax(a, 2) + + # along dynamic dim + res_batch = softmax(batch) + res = [torch.softmax(xs[j], 2) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_batch_view(self): + @torch.jit.batch(batch_size=4) + def view(a): + return a.view([4, -1, 3]) + + xs, batch = self.rand_batch(4, (True, 5), (False, 3)) + res_batch = view(batch) + res = [xs[j].view([1, -1, 3]) for j in range(4)] + 
self.assertEqual(res, res_batch.examples()) + + def test_batch_cat(self): + @torch.jit.batch(batch_size=4) + def cat2(a, b): + return torch.cat([a, b], 2) + + xs, batch = self.rand_batch(4, (True, 5), (False, 3)) + xs2, batch2 = xs, batch + res_batch = cat2(batch, batch2) + res = [torch.cat([xs[j], xs2[j]], 2) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + def test_batch_sum(self): @torch.jit.batch(batch_size=4) - def LSTMCell_batch(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - return h_t + def batch_sum(a): + return a.sum() + + xs, batch = self.rand_batch(4, (True, 5), (False, 3)) + res_batch = batch_sum(batch) + res = [xs[j].sum().unsqueeze(0) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_if_else(self): + def single_if(a, b): + if a > b: + a = a + b + else: + a = a - b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_if_else_with_scalar(self): + def single_if(a, b): + if a > 0.1: + a = a + b + else: + a = a - b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_if_noelse(self): + def single_if(a, b): + if a > b: + a = a + b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_if_noelse_with_scalar(self): + def single_if(a, b): + if a > 0.1: + a = a + b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_while(self): + def single_while(a, b): + while a > b: + a = a - b + return a + + batch_while = torch.jit.batch(batch_size=4)(single_while) + + a, batch_a = self.rand_batch(4, ()) + b = [torch.abs(torch.rand(1)) for i in range(4)] + batch_b = BatchTensor(b, torch.tensor([]).byte()) + res_batch 
= batch_while(batch_a, batch_b) + res = [single_while(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_while = torch.jit.script(single_while) + graph = torch.to_batch_graph(script_while.graph) + self.assertExpected(str(graph)) + + def test_for(self): + def single_for(x, y): + for _ in range(10): + x = x + y + return x + + batch_for = torch.jit.batch(batch_size=4)(single_for) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_for(batch_a, batch_b) + res = [single_for(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_for = torch.jit.script(single_for) + graph = torch.to_batch_graph(script_for.graph) + self.assertExpected(str(graph)) + + def test_lstm(self): + def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): + for i in range(x_all.size(1)): + x = x_all.select(1, i) + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + h = h_t + c = c_t + return h + + LSTM_batch = torch.jit.batch(batch_size=4)(LSTM) batch_size, input_size, hidden_size = 4, 3, 2 + xs, batch = self.rand_batch(batch_size, (True, 4), (False, input_size)) + hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) + cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) + + # input to hidden weights + w_xi = torch.rand(input_size, hidden_size) + w_xf = torch.rand(input_size, hidden_size) + w_xo = torch.rand(input_size, hidden_size) + w_xc = torch.rand(input_size, hidden_size) + # hidden to hidden weights + w_hi = torch.rand(hidden_size, hidden_size) + w_hf = torch.rand(hidden_size, hidden_size) + w_ho = torch.rand(hidden_size, hidden_size) + w_hc = torch.rand(hidden_size, hidden_size) + # bias terms + b_i = torch.rand(hidden_size) + b_f = torch.rand(hidden_size) + b_o = torch.rand(hidden_size) + b_c = torch.rand(hidden_size) + + ys = [LSTM(xs[j], hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] + ybs = LSTM_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) + self.assertEqual(ys, ybs.examples()) + + def test_greedy_search(self): + def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, + b_i, b_f, b_o, b_c, w_hs, b_s, iter_num): + iter_count = torch.zeros_like(iter_num) + while(iter_count < iter_num): + iter_count += 1 + # LSTM Cell + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + h = h_t + c = c_t + # calculate feature with max probability + s_t = torch.matmul(h_t, w_hs) + b_s + p_t = torch.softmax(s_t, 1) + i_t = torch.argmax(p_t, 1) + x = embed.index_select(1, i_t).squeeze(1) + return h + + 
greedy_batch = torch.jit.batch(batch_size=4)(greedy) + + batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 xs, batch = self.rand_batch(batch_size, (False, input_size)) hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) + embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) + iter_num = [torch.randint(2, 5, (1,)) for i in range(batch_size)] + iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) # input to hidden weights w_xi = torch.rand(input_size, hidden_size) @@ -1285,11 +1651,102 @@ def LSTMCell_batch(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f = torch.rand(hidden_size) b_o = torch.rand(hidden_size) b_c = torch.rand(hidden_size) + # hidden to vocab weights, bias + w_hs = torch.rand(hidden_size, vocab_size) + b_s = torch.rand(vocab_size) + + ys = [greedy(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j]) for j in range(batch_size)] + ybs = greedy_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch) + self.assertEqual(ys, ybs.examples()) - ys = [LSTMCell(xs[j].squeeze(0), hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] - ybs = LSTMCell_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) + def test_beam_search(self): + def beam(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, + b_i, b_f, b_o, b_c, w_hs, b_s, iter_num, idx): + k = 5 + vocab_size = embed.size(1) + iter_count = torch.zeros_like(iter_num) + max_len = idx.size(2) + while(iter_count < iter_num): + iter_count += 1 + # LSTM Cell + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + h = h_t + c = c_t + # calculate features with max probability + s_t = torch.matmul(h_t, w_hs) + b_s + s_t = s_t.view([1, s_t.size(1) * s_t.size(2)]) + p_t = torch.softmax(s_t, 1) + prob_t, idx_t = torch.topk(p_t, k, 1) + if(int(idx_t.dim()) > 1): + idx_t_tmp = idx_t.squeeze(0) + else: + idx_t_tmp = idx_t + new_y = torch.fmod(idx_t_tmp, vocab_size) + pre_y = idx_t_tmp / vocab_size + x = embed.index_select(1, new_y) + h = h_t.index_select(1, pre_y) + c = c_t.index_select(1, pre_y) + iter = int(iter_count[0]) + idx = torch.cat([idx.narrow(2, 0, iter).index_select(1, pre_y), + torch.fmod(idx_t, vocab_size).unsqueeze(-1), + idx.narrow(2, iter, max_len - iter)], 2) + idx = idx.narrow(2, 0, max_len) + return idx + + beam_batch = torch.jit.batch(batch_size=4)(beam) + + k = 5 + batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 + max_len = 5 + xs, batch = self.rand_batch(batch_size, (False, 1), (False, input_size)) + hx, h_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) + cx, c_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) + embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) + iter_num = [torch.randint(2, max_len + 
1, (1,)) for i in range(batch_size)] + iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) + + # input to hidden weights + w_xi = torch.rand(input_size, hidden_size) + w_xf = torch.rand(input_size, hidden_size) + w_xo = torch.rand(input_size, hidden_size) + w_xc = torch.rand(input_size, hidden_size) + # hidden to hidden weights + w_hi = torch.rand(hidden_size, hidden_size) + w_hf = torch.rand(hidden_size, hidden_size) + w_ho = torch.rand(hidden_size, hidden_size) + w_hc = torch.rand(hidden_size, hidden_size) + # bias terms + b_i = torch.rand(1, hidden_size) + b_f = torch.rand(1, hidden_size) + b_o = torch.rand(1, hidden_size) + b_c = torch.rand(1, hidden_size) + # hidden to vocab weights, bias + w_hs = torch.rand(hidden_size, vocab_size) + b_s = torch.rand(1, vocab_size) + + idx_batch = torch.jit.BatchTensor(torch.zeros([batch_size, k, max_len], dtype=torch.long), + torch.zeros([batch_size, 1, max_len]).byte(), + torch.tensor([0, 1]).byte()) + idx = [torch.zeros([1, k, max_len], dtype=torch.long) for _ in range(batch_size)] + + ys = [beam(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, + b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j], idx[j]).narrow(2, 0, int(iter_num[j])) + for j in range(batch_size)] + ybs = beam_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch, idx_batch) self.assertEqual(ys, ybs.examples()) @@ -3648,10 +4105,10 @@ def test_unknown_builtin(self): def unknown_builtin(x): return x.splork(3) - def test_expected_tensor_found_tuple(self): - with self.assertRaisesRegex(RuntimeError, 'expected a tensor value but found'): + def test_return_tuple(self): + with self.assertRaisesRegex(RuntimeError, 'only supported return types'): @torch.jit.script - def return_tuple_wrong(x): + def return_tuple(x): a = (x, x) return a, x @@ -4370,6 +4827,17 @@ def tuple_arg(x): # type: (Tuple[Tensor, Tensor]) -> Tensor return x + 1 + def test_script_non_tensor_args_outputs(self): + @torch.jit.script + def fn(x, y): + # type: (Tensor, float) -> float + return float((x + y).sum()) + + x = torch.ones(2, 2) + z = fn(x, 1) + self.assertIsInstance(z, float) + self.assertEqual(z, 8.) 
+ @unittest.skip('https://github.com/pytorch/pytorch/issues/9595') def test_inline_and_run_annotated_script_fn(self): @torch.jit.script @@ -4912,11 +5380,9 @@ def forward(self, x, y): 'test_expand_new_dim', 'test_expand_new_dim_front_old_front_1', 'test_expand_scalar_to_dims', - 'test_expand_scalar_to_scalar', 'test_expand_size', 'test_permute', 'test_permute_neg_dim', - 'test_permute_scalar', 'test_repeat', 'test_repeat_scalar', 'test_repeat_single_number', @@ -4924,12 +5390,10 @@ def forward(self, x, y): 'test_reshape', 'test_reshape_1d', 'test_reshape_scalar_to_1d', - 'test_reshape_scalar_to_scalar', 'test_reshape_size', 'test_view', 'test_view_1d', 'test_view_scalar_to_1d', - 'test_view_scalar_to_scalar', 'test_view_size', 'test_split_dim', 'test_split_dim_neg0', diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index 1463d15cf22d0c..de65e6fc8ce7a0 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -693,14 +693,18 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return module.backward(input, grad_output) - def _forward_criterion(self, criterion, input, target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() with torch.no_grad(): - return criterion.forward(input, target) + return criterion.forward(input, target, *extra_args) - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() # Ignore gradOutput. It's used for non-legacy tests. with torch.no_grad(): - return criterion.backward(input, target) + return criterion.backward(input, target, *extra_args) def _zero_grad_parameters(self, module): return module.zeroGradParameters() diff --git a/test/test_nn.py b/test/test_nn.py index ccd698747ae8d5..8682463cf9bc6c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -36,7 +36,7 @@ TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, loss_reference_fns, get_reduction, \ - get_weight, smoothl1loss_reference, kldivloss_reference + get_weight, smoothl1loss_reference, kldivloss_reference, ctcloss_reference if TEST_SCIPY: @@ -383,6 +383,8 @@ class NewCriterionTest(InputVariableMixin, CriterionTest): def __init__(self, *args, **kwargs): super(NewCriterionTest, self).__init__(*args, **kwargs) self.check_gradgrad = kwargs.get('check_gradgrad', True) + self.check_half = kwargs.get('check_half', True) + self.convert_target = kwargs.get('convert_target', True) def _do_extra_tests(self, test_case, module, input, target): if not self.check_gradgrad: @@ -407,7 +409,7 @@ def apply_fn(input1, input2, *params): gradcheck(apply_fn, inputs) gradgradcheck(apply_fn, inputs) - def test_cuda(self, test_case, dtype=None): + def test_cuda(self, test_case, dtype=None, extra_args=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) @@ -430,7 +432,7 @@ def convert_dtype(obj, dtype, requires_grad=False): if dtype is not None: cpu_input = convert_dtype(cpu_input, dtype, True) # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor): + if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: cpu_target = convert_dtype(cpu_target, dtype) cpu_module.type(dtype) gpu_module.type(dtype) @@ -447,13 +449,13 @@ 
def convert_dtype(obj, dtype, requires_grad=False): # Loss modules with weights require consistent input/module weight types cpu_module = self.constructor(*self.constructor_args) - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) - gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target) + cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) # dtype can be None, so set precision in this way instead of a precision map test_case.assertEqual(cpu_output, gpu_output, 1e-1 if dtype == torch.half else 4e-4) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target) + cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) test_case.assertEqual(cpu_gradInput, gpu_gradInput, 1e-1 if dtype == torch.half else 4e-4) except NotImplementedError: pass @@ -465,6 +467,10 @@ def _get_target(self): def constructor_args(self): return self._get_arg('constructor_args', False) + @property + def extra_args(self): + return self._get_arg('extra_args', False) + class TestNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -479,20 +485,24 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return None return input.grad.data - def _forward_criterion(self, criterion, input, target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() if isinstance(input, tuple): - args = input + (target,) + args = input + (target,) + extra_args output = criterion(*args) else: - output = criterion(input, target) + output = criterion(input, target, *extra_args) return output.item() - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() input_tuple = input if isinstance(input, tuple) else (input,) for i in input_tuple: if i.grad is not None: i.grad.data.zero_() - args = input_tuple + (target,) + args = input_tuple + (target,) + extra_args if gradOutput is None: gradOutput = torch.ones(()) criterion(*args).backward(gradOutput.type_as(input_tuple[0])) @@ -1585,6 +1595,7 @@ def test(nonlinearity, *args, **kwargs): test('relu6') test('elu') test('selu') + test('celu') test('rrelu') test('rrelu', inplace=True) test('hardtanh') @@ -3578,6 +3589,19 @@ def test_NLLLoss_mismatched_batch(self): with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): F.nll_loss(x, t) + @unittest.skipIf(not (TEST_CUDNN and TEST_CUDNN_VERSION >= 7000), "needs cudnn >= 7.0") + def test_CTCLoss_cudnn(self): + target_lengths = [30, 25, 20] + input_lengths = [50, 50, 50] + targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) + log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2) + res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) + expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float() + with torch.backends.cudnn.flags(enabled=False): + res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, 
target_lengths) + self.assertEqual(res, expected) + self.assertEqual(res2, res) + def test_RNN_cell_no_broadcasting(self): def test(cell_module, input, hx, input_size, hidden_size): cell = cell_module(input_size, hidden_size) @@ -4351,7 +4375,7 @@ def _verify_pixel_shuffle(self, input, output, upscale_factor): self.assertEqual(output[:, c, h, w], input[:, channel_idx, height_idx, weight_idx]) def test_inplace_thnn(self): - modules = [nn.ReLU, nn.ELU, nn.SELU, nn.RReLU] + modules = [nn.ReLU, nn.ELU, nn.SELU, nn.CELU, nn.RReLU] for mod in modules: r = mod(inplace=True) input = torch.randn(5, 5, requires_grad=True) @@ -4812,6 +4836,12 @@ def test_triplet_margin_loss_swap_no_reduce(self): self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none'), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True, reduction='none')) + def test_pointwise_loss_target_grad_none_reduction(self): + i = torch.randn(5, 10) + t = torch.randn(5, 10, requires_grad=True) + self.assertEqual(F.mse_loss(i, t, reduction='none').size(), t.size()) + self.assertEqual(F.l1_loss(i, t, reduction='none').size(), t.size()) + def test_cosine_similarity(self): input1 = torch.randn(4, 4, requires_grad=True) input2 = torch.randn(4, 4, requires_grad=True) @@ -4842,30 +4872,30 @@ def test_grid_sample(self): def test_cpu_against_cuda(N, C, H, W, padding_mode): def test_shape(N, C, IH, IW, H, W, padding_mode): - input_cpu = Variable(torch.randn(C, N, IH, IW).transpose(0, 1), requires_grad=True) - grid_cpu = Variable(torch.randn(H, N, W, 2).transpose(0, 1), requires_grad=True) + input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() + grid_cpu = torch.randn(H, N, W, 2).transpose(0, 1).requires_grad_() out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) - input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = out_cpu.data.new(out_cpu.size()).normal_() + gradients = torch.randn_like(out_cpu) out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(C, IH, IW) - input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) + base_input = torch.randn(N, C, 1, IW) + input_cpu = base_input.expand_as(input_cuda).requires_grad_() grid_cpu = torch.randn(N, H, W, 2, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + grid_cuda = grid_cpu.detach().cuda().requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4873,21 +4903,21 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): test_shape(N, C, H, W, H, W, 
padding_mode) # test larger output - N = random.randint(1, 8) - C = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) + N = random.randint(2, 8) + C = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, IH, IW, H, W, padding_mode) # test smaller output - N = random.randint(1, 8) - C = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) - H = random.randint(1, IH) - W = random.randint(1, IW) + N = random.randint(2, 8) + C = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) + H = random.randint(2, IH) + W = random.randint(2, IW) test_shape(N, C, IH, IW, H, W, padding_mode) # test known input on CPU @@ -4926,42 +4956,38 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): # test CUDA against CPU if TEST_CUDA: test_cpu_against_cuda(N, C, H, W, padding_mode) - - # test channels >1024, which doesn't work on cudnn 7102 and further - N, C, H, W = 1, 1025, 3, 3 - self.assertTrue(gradcheck( - lambda inp, grid: F.grid_sample(inp, grid, padding_mode=padding_mode), - (input, grid))) - test_cpu_against_cuda(N, C, H, W, padding_mode) + if TEST_CUDNN: + with cudnn.flags(enabled=False): + test_cpu_against_cuda(N, C, H, W, padding_mode) def test_grid_sample_3d(self): def test_cpu_against_cuda(N, C, D, H, W, padding_mode): def test_shape(N, C, ID, IH, IW, D, H, W, padding_mode): - input_cpu = Variable(torch.randn(C, N, ID, IH, IW).transpose(0, 1), requires_grad=True) - grid_cpu = Variable(torch.randn(D, N, H, W, 3).transpose(0, 1), requires_grad=True) + input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_() + grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_() out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, D, H, W])) - input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = out_cpu.data.new(out_cpu.size()).normal_() + gradients = torch.randn_like(out_cpu) out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(C, ID, IH, IW) - input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) + base_input = torch.randn(N, C, 1, IH, IW) + input_cpu = base_input.expand_as(input_cuda).requires_grad_() grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + grid_cuda = grid_cpu.detach().cuda().requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4969,35 +4995,35 @@ def test_shape(N, C, ID, IH, IW, D, 
H, W, padding_mode): test_shape(N, C, D, H, W, D, H, W, padding_mode) # test larger output - N = random.randint(1, 8) - C = random.randint(1, 8) - ID = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) + N = random.randint(2, 8) + C = random.randint(2, 8) + ID = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) D = random.randint(ID + 1, 12) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test smaller output - N = random.randint(1, 8) - C = random.randint(1, 8) - ID = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) - D = random.randint(1, ID) - H = random.randint(1, IH) - W = random.randint(1, IW) + N = random.randint(2, 8) + C = random.randint(2, 8) + ID = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) + D = random.randint(2, ID) + H = random.randint(2, IH) + W = random.randint(2, IW) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test known input on CPU for padding_mode in ['zeros', 'border']: # do gradcheck - N = random.randint(1, 8) - C = random.randint(1, 8) - D = random.randint(1, 8) - H = random.randint(1, 8) - W = random.randint(1, 8) + N = random.randint(2, 8) + C = random.randint(2, 8) + D = random.randint(2, 8) + H = random.randint(2, 8) + W = random.randint(2, 8) input = torch.randn(N, C, D, H, W, requires_grad=True) grid = torch.randn(N, D, H, W, 3, requires_grad=True) self.assertTrue(gradcheck( @@ -5540,6 +5566,11 @@ def test_unfold_invalid_arg(self): unfold = nn.Unfold(kernel_size=(1, 3), padding=(1, 1), dilation=(1, 2)) unfold(torch.randn(1, 2, 2, 2)) + def test_softmin(self): + x = torch.randn(2, 16) + self.assertEqual(F.softmin(x, 1), F.softmax(-x, 1)) + self.assertEqual(F.softmin(x, 0), F.softmax(-x, 0)) + def test_adaptive_log_softmax(self): # args validation with self.assertRaises(ValueError): @@ -6006,15 +6037,20 @@ def add(test_name, fn): add(test_name, lambda self, test=test: test(self)) cuda_test_name = test_name + '_cuda' # With dtype enable, it's good enough to test against three floating types + kwargs = {} + if 'extra_args' in get_function_arglist(test.test_cuda): + kwargs['extra_args'] = test.extra_args + if 'dtype' in get_function_arglist(test.test_cuda): add(cuda_test_name + '_float', lambda self, - test=test: test.test_cuda(self, dtype=torch.float)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) add(cuda_test_name + '_double', lambda self, - test=test: test.test_cuda(self, dtype=torch.double)) - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) + if getattr(test, 'check_half', True): + add(cuda_test_name + '_half', lambda self, + test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) else: - add(cuda_test_name, lambda self, test=test: test.test_cuda(self)) + add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) def wrap_functional(fn, **kwargs): @@ -6174,6 +6210,45 @@ def forward(self, *args): check_sum_reduction=True, check_gradgrad=False, ), + dict( + module_name='CTCLoss', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, 
il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='1d_target', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='2d_int_target', + constructor_args=(0,), # blank=0 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + convert_target=False, + ), ] @@ -7766,6 +7841,21 @@ def multimarginloss_weights_no_reduce_test(): check_inplace=True, desc='scalar' ), + dict( + module_name='CELU', + input_size=(3, 2, 5), + constructor_args=(2.,), + check_inplace=True, + reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)) + ), + dict( + module_name='CELU', + input_size=(), + constructor_args=(2.,), + check_inplace=True, + reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)), + desc='scalar' + ), dict( module_name='GLU', input_size=(5, 6), diff --git a/test/test_optim.py b/test/test_optim.py index 41c3bfc1964f33..2d5b876dd3a8e1 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -31,7 +31,6 @@ def wrapper(closure, params, state): class TestOptim(TestCase): - def _test_rosenbrock(self, constructor, old_fn): params_t = torch.Tensor([1.5, 1.5]) state = {} @@ -505,6 +504,20 @@ def forward(self, x): return self.conv2(F.relu(self.conv1(x))) +class LambdaLRTestObject: + def __init__(self, value): + self.value = value + + def __call__(self, epoch): + return self.value * epoch + + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + class TestLRScheduler(TestCase): def setUp(self): self.net = SchedulerTestNet() @@ -672,6 +685,28 @@ def test_reduce_lr_on_plateau_state_dict(self): if key not in {'optimizer', 'is_better'}: self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) + def test_lambda_lr_state_dict_fn(self): + scheduler = LambdaLR(self.opt, lr_lambda=lambda x: x) + state = scheduler.state_dict() + self.assertIsNone(state['lr_lambdas'][0]) + + scheduler_copy = LambdaLR(self.opt, lr_lambda=lambda x: x) + scheduler_copy.load_state_dict(state) + for key in scheduler.__dict__.keys(): + if key not in {'optimizer', 'lr_lambdas'}: + self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) + + def test_lambda_lr_state_dict_obj(self): + scheduler = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(10)) + state = scheduler.state_dict() + self.assertIsNotNone(state['lr_lambdas'][0]) + + scheduler_copy = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(-1)) + scheduler_copy.load_state_dict(state) + for key in scheduler.__dict__.keys(): + if key not in {'optimizer'}: + self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) + def 
_check_scheduler_state_dict(self, constr, constr2, epochs=10): scheduler = constr() for _ in range(epochs): diff --git a/test/test_torch.py b/test/test_torch.py index 2a8c897713111f..edd69473f8505b 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -22,7 +22,7 @@ from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ - IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize, TEST_WITH_ROCM + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ROCM from multiprocessing.reduction import ForkingPickler if TEST_NUMPY: @@ -866,7 +866,6 @@ def test_multidim(x, dim): def test_dim_reduction(self): self._test_dim_reduction(self, lambda t: t) - @skipIfNoZeroSize def test_reduction_empty(self): fns_to_test = [ # name, function, identity @@ -930,7 +929,6 @@ def test_reduction_empty(self): self.assertEqual(torch.ones((2, 1, 4), device=device), xb.all(1, keepdim=True)) self.assertEqual(torch.ones((), device=device), xb.all()) - @skipIfNoZeroSize def test_pairwise_distance_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -1690,6 +1688,7 @@ def test_einsum(self): ("...ii->...i", I), # batch diagonal # -- Other ("bn,anm,bm->ba", l, w, r), # as torch.bilinear + ("... ii->...i ", I), # batch diagonal with spaces ] for test in test_list: actual = torch.einsum(test[0], test[1:]) @@ -2240,7 +2239,6 @@ def test_tensor_factory_cuda_type(self): self.assertTrue(x.is_cuda) torch.set_default_tensor_type(saved_type) - @skipIfNoZeroSize def test_tensor_factories_empty(self): # ensure we can create empty tensors from each factory function shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 2, 0, 0)] @@ -2927,7 +2925,6 @@ def _test_in_place_broadcastable(t0, t1, t2=None): def test_broadcast(self): self._test_broadcast(self, lambda t: t) - @skipIfNoZeroSize def test_broadcast_empty(self): # empty + empty self.assertRaises(RuntimeError, lambda: torch.randn(5, 0) + torch.randn(0, 5)) @@ -2943,6 +2940,17 @@ def test_broadcast_empty(self): torch.randn(0, 7, 0, 6, 5, 0, 1) + torch.randn(1, 1, 5, 1, 7)) self.assertRaises(RuntimeError, lambda: torch.randn(7, 0) + torch.randn(2, 1)) + def test_broadcast_tensors(self): + x0 = torch.randn(2, 1, 3) + x1 = torch.randn(3) + x2 = torch.randn(3, 1) + expected_size = (2, 3, 3) + + y0, y1, y2 = torch.broadcast_tensors(x0, x1, x2) + self.assertTrue(y0.size() == expected_size) + self.assertTrue(y1.size() == expected_size) + self.assertTrue(y2.size() == expected_size) + @staticmethod def _test_contiguous(self, cast): x = cast(torch.randn(1, 16, 5, 5)) @@ -2957,9 +2965,7 @@ def test_contiguous(self): return self._test_contiguous(self, lambda t: t) def test_empty_tensor_props(self): - sizes = [(0,)] - if torch._C._use_zero_size_dim(): - sizes += [(0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] + sizes = [(0,), (0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for size in sizes: for device in devices: @@ -3476,9 +3482,6 @@ def test_cat_empty_legacy(self): @staticmethod def _test_cat_empty(self, use_cuda=False): - if not torch._C._use_zero_size_dim(): - return - dtype = torch.float32 device = 'cuda' if use_cuda else 'cpu' @@ -3524,9 +3527,6 @@ def test_narrow(self): self.assertEqual(x.narrow(-2, -1, 1), torch.Tensor([[6, 7, 8]])) def test_narrow_empty(self): - if not 
torch._C._use_zero_size_dim(): - return - devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(2, 3, 4, device=device) @@ -3658,7 +3658,7 @@ def test_randn(self): self.assertEqual(res1, res2) def test_slice(self): - empty = torch.empty(0, 4) if torch._C._use_zero_size_dim() else torch.Tensor() + empty = torch.empty(0, 4) x = torch.arange(0., 16).view(4, 4) self.assertEqual(x[:], x) self.assertEqual(x[:4], x) @@ -4951,10 +4951,7 @@ def consec(size, start=1): reference = conv_fn(consec((3, 3, 3))) # empty tensor indexing - if torch._C._use_zero_size_dim(): - self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) - else: - self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new()) + self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) self.assertEqual(reference[0], consec((3, 3)), 0) self.assertEqual(reference[1], consec((3, 3), 10), 0) @@ -5000,14 +4997,9 @@ def consec(size, start=1): self.assertEqual(reference[None, 2:5, None, None], reference.unsqueeze(0)[:, 2:5].unsqueeze(2).unsqueeze(2)) # indexing 0-length slice - if torch._C._use_zero_size_dim(): - self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) - self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) - self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) - else: - self.assertEqual(torch.tensor([]), reference[slice(0)]) - self.assertEqual(torch.tensor([]), reference[slice(0), 2]) - self.assertEqual(torch.tensor([]), reference[2, slice(0)]) + self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) + self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) + self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) self.assertEqual(torch.tensor([]), reference[2, 1:1, 2]) # indexing with step @@ -5717,7 +5709,6 @@ def check(src, idx): check(src, idx) check(src.transpose(1, 2), idx) - @skipIfNoZeroSize def test_take_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -5748,7 +5739,6 @@ def test_put_accumulate(self): dst.put_(idx, src, accumulate=True) self.assertEqual(dst.tolist(), [[5, 7], [1, 1]]) - @skipIfNoZeroSize def test_put_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6070,7 +6060,6 @@ def _test_view(self, cast): def test_view(self): TestTorch._test_view(self, lambda x: x) - @skipIfNoZeroSize def test_view_empty(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.view(1, 0, 6, 1, 1).shape) @@ -6096,12 +6085,8 @@ def test_reshape(self): self.assertEqual(empty, empty.reshape(-1)) self.assertEqual(empty, empty.reshape([0])) # TODO: fix these once we have multi-dimensional empty tensors - if torch._C._use_zero_size_dim(): - self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) - self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) - else: - self.assertEqual(empty.reshape([0, 1]).shape, (0,)) - self.assertEqual(empty.reshape([1, -1]).shape, (0,)) + self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) + self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) self.assertRaises(RuntimeError, lambda: empty.reshape(1)) x = torch.randn(3, 3) @@ -6109,7 +6094,6 @@ def test_reshape(self): self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(1, 9, 1)).data_ptr()) self.assertRaises(RuntimeError, lambda: x.reshape_as(torch.rand(10))) - @skipIfNoZeroSize def test_empty_reshape(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.reshape(1, 
0, 6, 1, 1).shape) @@ -6119,7 +6103,6 @@ def test_empty_reshape(self): # match NumPy semantics -- don't infer the size of dimension with a degree of freedom self.assertRaises(RuntimeError, lambda: x.reshape(0, -1)) - @skipIfNoZeroSize def test_tensor_shape_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6185,7 +6168,6 @@ def test_tensor_shape_empty(self): self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)]) # functions that operate over a dimension but don't reduce. - @skipIfNoZeroSize def test_dim_function_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6309,7 +6291,6 @@ def test_dim_function_empty(self): c = torch.randn((0, 1, 2), device=device) self.assertEqual(c, c.index_select(0, ind_empty)) - @skipIfNoZeroSize def test_blas_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6379,7 +6360,6 @@ def fn(torchfn, *args): A_LU, pivots = fn(torch.btrifact, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) - @skipIfNoZeroSize def test_blas_alpha_beta_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6405,7 +6385,6 @@ def test_blas_alpha_beta_empty(self): self.assertEqual(torch.full((2, 3), beta * value, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) - @skipIfNoZeroSize @skipIfNoLapack def test_lapack_empty(self): # FIXME: these are just a selection of LAPACK functions -- we need a general strategy here. @@ -6896,9 +6875,6 @@ def test_nonzero(self): self.assertNotEqual(tensor[dst1[i, 0], dst1[i, 1], dst1[i, 2]].item(), 0) def test_nonzero_empty(self): - if not torch._C._use_zero_size_dim(): - return - devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(0, 2, 0, 5, 0, device=device) @@ -7523,15 +7499,11 @@ def test_load_error_msg(self): expected_err_msg = (".*You can only torch.load from a file that is seekable. 
" + "Please pre-load the data into a buffer like io.BytesIO and " + "try to load from it instead.") - if PY3: - import urllib.request - import io - resource = urllib.request.urlopen('https://download.pytorch.org/test_data/linear.pt') - self.assertRaisesRegex(io.UnsupportedOperation, expected_err_msg, lambda: torch.load(resource)) - else: - import urllib - resource = urllib.urlopen('https://download.pytorch.org/test_data/linear.pt') - self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) + + resource = FilelikeMock(data=b"data") + delattr(resource, "tell") + delattr(resource, "seek") + self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) def test_from_buffer(self): a = bytearray([1, 2, 3, 4]) @@ -7894,10 +7866,7 @@ def test_from_numpy(self): # check zero dimensional x = np.zeros((0, 2)) - if torch._C._use_zero_size_dim(): - self.assertEqual(torch.from_numpy(x).shape, (0, 2)) - else: - self.assertEqual(torch.from_numpy(x).shape, (0,)) + self.assertEqual(torch.from_numpy(x).shape, (0, 2)) # check ill-sized strides raise exception x = np.array([3., 5., 8.]) @@ -7947,6 +7916,20 @@ def test_ctor_with_numpy_array(self): for i in range(len(array)): self.assertEqual(tensor[i], array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_ctor_with_numpy_scalar_ctor(self): + dtypes = [ + np.double, + np.float, + np.float16, + np.int64, + np.int32, + np.int16, + np.uint8 + ] + for dtype in dtypes: + self.assertEqual(dtype(42), torch.tensor(dtype(42)).item()) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_index(self): i = np.int32([0, 1, 2]) @@ -8034,6 +8017,17 @@ def test_numpy_array_interface(self): for i in range(len(x)): self.assertEqual(geq2_x[i], geq2_array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_multiplication_numpy_scalar(self): + np_sc = np.float64(2.0) + t = torch.ones(2, requires_grad=True) + r1 = np_sc * t + self.assertIsInstance(r1, torch.Tensor) + self.assertTrue(r1.requires_grad) + r2 = t * np_sc + self.assertIsInstance(r2, torch.Tensor) + self.assertTrue(r2.requires_grad) + def test_error_msg_type_translation(self): with self.assertRaisesRegex( RuntimeError, diff --git a/third_party/eigen b/third_party/eigen index e9e95489a0b241..cafae68f33f7f4 160000 --- a/third_party/eigen +++ b/third_party/eigen @@ -1 +1 @@ -Subproject commit e9e95489a0b241412e31f0525e85b2fab386c786 +Subproject commit cafae68f33f7f41270b2e8c2dd181f510aa4d918 diff --git a/third_party/onnx b/third_party/onnx index c761845c7f6880..32ac71b1b9c1bd 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit c761845c7f6880ab7eb7e2866d673834c7149e89 +Subproject commit 32ac71b1b9c1bd7f196eed3b311734ec6ab3c367 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 14fd6d7cf5e09c..a66cb77f8ce9dd 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -201,6 +201,9 @@ - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) self, weight, bias: conv_tbc_backward(grad, self, weight, bias, pad) +- name: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank) + log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank) + - name: det(Tensor self) self: det_backward(grad, self, result) @@ -308,6 +311,12 @@ self: gesv_backward_self(grad, self, A) A: gesv_backward_A(grad, self, A, result0) +- name: 
grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) + input, grid: grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode) + +- name: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) + input, grid: grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode) + - name: gt_(Tensor self, Scalar other) self: zeros_like(self) @@ -802,8 +811,8 @@ - name: relu(Tensor self) self: threshold_backward(grad, self, 0, 0) -- name: elu_forward(Tensor self, Scalar alpha, Scalar scale) - self: elu_backward(grad, alpha, scale, output) +- name: elu_forward(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) + self: elu_backward(grad, alpha, scale, input_scale, output) - name: glu_forward(Tensor self, int64_t dim) self: glu_backward(grad, self, dim) @@ -974,12 +983,6 @@ - name: thnn_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, Tensor columns, Tensor ones, std::array output_mask) grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, {{0, 0, 0}}, 1, false, false, false, grad_input_mask) -- name: thnn_grid_sampler_bilinear2d_forward(Tensor self, Tensor grid, int64_t padding_mode) - self, grid: thnn_grid_sampler_bilinear2d_backward(grad, self, grid, padding_mode) - -- name: thnn_grid_sampler_bilinear3d_forward(Tensor self, Tensor grid, int64_t padding_mode) - self, grid: thnn_grid_sampler_bilinear3d_backward(grad, self, grid, padding_mode) - # NN double backwards support - name: adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) @@ -1006,9 +1009,9 @@ grad_output: avg_pool3d(grad, kernel_size, stride, padding, ceil_mode, count_include_pad) self: zeros_like(self) -- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Tensor output) - grad_output: elu_backward(grad, alpha, scale, output) - output: grad * grad_output * (output < 0).toType(grad.type()) +- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) + grad_output: elu_backward(grad, alpha, scale, input_scale, output) + output: grad * grad_output * input_scale * (output < 0).toType(grad.type()) - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList output_size, Tensor indices) grad_output: max_pool_double_backward(grad, indices, 2) @@ -1145,6 +1148,8 @@ output: -2 * output * grad * grad_output # cudnn +- name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) + log_probs: result1 - name: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) self, weight, bias: cudnn_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 45af42655f96cc..2bee61b024317e 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -340,6 +340,8 @@ def save_variables(saved_variables, is_output): elif arg['type'] == 'TensorList': name += '_' expr = 'make_saved_variable_list({})'.format(arg['name']) + elif 
arg['type'] == 'IntList': + expr = expr + ".vec()" stmts.append('grad_fn->{} = {};'.format(name, expr)) return stmts diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index f859f814b4f8bc..0622fae5f2e8e8 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -175,7 +175,7 @@ Tensor prod_safe_zeros_backward(const Tensor &grad, const Tensor& inp, int64_t d return grad; } - std::vector ones_size(inp.sizes()); + auto ones_size = inp.sizes().vec(); ones_size[dim] = 1; Tensor ones = at::ones(ones_size, grad.type()); Tensor exclusive_normal_nocp = at::cat({ones, inp.narrow(dim, 0, inp.size(dim) - 1)}, dim); @@ -328,7 +328,7 @@ Tensor cumprod_backward(const Tensor &grad, const Tensor &input, int64_t dim) { return sum_scan_exclusive(result * grad, dim) / input; } - std::vector ones_size(input.sizes()); + auto ones_size = input.sizes().vec(); ones_size[dim] = 1; Tensor ones = at::ones({1}, grad.type()).expand(ones_size); Tensor grad_input = at::zeros(input.sizes(), grad.type()); @@ -461,7 +461,7 @@ Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntList sizes, } Tensor renorm_backward(const Tensor & grad, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { - auto transposed_sizes = std::vector(self.transpose(dim, 0).sizes()); + auto transposed_sizes = self.transpose(dim, 0).sizes().vec(); auto flatten = [&](const Tensor & t) { return t.transpose(dim, 0).contiguous().view({t.size(dim), -1}); }; @@ -637,7 +637,7 @@ Tensor split_with_sizes_backward(const std::vector &g grads_all_defined[j] = grads[j]; } else { auto length = split_sizes[j]; - std::vector grad_size(sizes); + auto grad_size = sizes.vec(); grad_size[dim] = length; grads_all_defined[j] = at::zeros(grad_size, type); } @@ -659,7 +659,7 @@ Tensor split_backward(const std::vector &grads, Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) { AT_ASSERT(indices.dim() >= dim); - auto size = std::vector(indices.sizes().slice(0, indices.dim() - dim)); + auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); size.push_back(-1); auto indices_view = indices.view(size); return grad.contiguous().view(size).gather(-1, indices_view).view(indices.sizes()); @@ -686,7 +686,7 @@ Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, cons Tensor glu_double_backward_grad_output(const Tensor & grad, const Tensor & input, int64_t dim) { if (dim < 0) dim += input.dim(); - std::vector sizes = input.sizes(); + auto sizes = input.sizes().vec(); sizes[dim] /= 2; auto tmp = grad * glu_backward(at::ones(sizes, input.type()), input, dim); return tmp.narrow(dim, 0, sizes[dim]) + tmp.narrow(dim, sizes[dim], sizes[dim]); @@ -1545,27 +1545,27 @@ Tensor symeig_backward(const std::vector &grads, cons bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) { auto glambda = grads[0]; auto gv = grads[1]; - + auto vt = v.t(); - + if (!eigenvectors) { throw std::runtime_error(std::string("cannot compute backward without " "computing eigenvectors in forward pass")); } - + Tensor result; if (gv.defined()) { Tensor F = lambda.unsqueeze(0).expand_as(self).clone(); F.sub_(at::unsqueeze(lambda, 1)); F.diagonal().fill_(INFINITY); F.pow_(-1); - + F.mul_(vt.mm(gv)); result = v.mm(F.mm(vt)); } else { result = at::zeros_like(self); } - + if (glambda.defined()) { result.add_((v * glambda).mm(vt)); } diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 
ae95bf7197770e..00d927f1fdf7f8 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -29,7 +29,7 @@ struct TypeAndSize { TypeAndSize() : type(nullptr) {} /* implicit */ TypeAndSize(const Tensor & t) - : sizes(t.sizes()) + : sizes(t.sizes().vec()) , type(&t.type()) {} Tensor zeros() { return at::zeros(sizes, *type); } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 2f1adf0ab59f4b..bd4c59cfe9d380 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -398,7 +398,7 @@ Tensor VariableType::contiguous(const Tensor & self) const { static std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { - args_sizes[i] = tensors[i].sizes(); + args_sizes[i] = tensors[i].sizes().vec(); } return args_sizes; } diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 4a0dbd04c905f1..8f79c2830e96c0 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -97,7 +97,11 @@ if [[ $(uname) == 'Darwin' ]]; then LDFLAGS="$LDFLAGS -Wl,-rpath,@loader_path" LD_POSTFIX=".dylib" else - LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" + if [[ $USE_ROCM -eq 1 ]]; then + LDFLAGS="$LDFLAGS -Wl,-rpath,\\\\\\\$ORIGIN" + else + LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" + fi fi CPP_FLAGS=" -std=c++11 " GLOO_FLAGS="" diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index abbadc70691b46..77b101dedf0f3e 100644 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -7,6 +7,7 @@ import subprocess import sys + DEFAULT_FILE_PATTERN = r".*\.[ch](pp)?" # @@ -start,count +start,count @@ @@ -26,6 +27,11 @@ def run_shell_command(arguments, process_name=None): return output.decode() +def normalize_directory_path(path): + """Normalizes a directory path.""" + return path.rstrip('/') + + def transform_globs_into_regexes(globs): """Turns glob patterns into regular expressions.""" return [glob.replace("*", ".*").replace("?", ".") for glob in globs] @@ -49,16 +55,37 @@ def git_diff(args, verbose): return run_shell_command(command, process_name="git diff") -def filter_files(files, file_patterns): +def filter_files(files, file_patterns, verbose): """Returns all files that match any of the patterns.""" filtered = [] for file in files: + has_match = False for pattern in file_patterns: - if pattern.match(file): + if pattern.search(file): filtered.append(file) + has_match = True + if not has_match and verbose: + message = "{} does not match any ".format(file) + message += "file pattern in {{{}}}".format(', '.join(map(str, file_patterns))) + print(message) return filtered +def remove_recursive_files(files, paths, verbose): + """ + Removes all files that are not immediately under one of the given paths. + """ + for file in files: + if os.path.dirname(file) in paths: + yield file + else: + if verbose: + + message = "{} ({}) does not match any ".format(file, os.path.dirname(file)) + message += "non-recursive path in {{{}}}".format(", ".join(paths)) + print(message) + + def get_changed_files(revision, paths, verbose): """Runs git diff to get the paths of all changed files.""" # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). 
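As a quick illustration of the new path handling in tools/clang_tidy.py above: a file now survives filtering only if a file pattern matches anywhere in its path (search rather than match) and, with -n/--no-recursive, only if its immediate parent directory is one of the -p/--paths entries. A minimal sketch of that logic (simplified, not the script's exact code):

    import os
    import re

    def keep_non_recursive(files, paths):
        # Keep only files whose immediate parent directory is one of the given paths.
        return [f for f in files if os.path.dirname(f) in paths]

    pattern = re.compile(r".*\.[ch](pp)?")
    files = ["torch/csrc/jit/ir.cpp", "torch/csrc/Module.cpp", "README.md"]
    files = [f for f in files if pattern.search(f)]      # drops README.md
    print(keep_non_recursive(files, ["torch/csrc"]))     # ['torch/csrc/Module.cpp']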
@@ -152,7 +179,17 @@ def parse_options(): ) parser.add_argument("-r", "--revision", help="Git revision to get changes from") parser.add_argument( - "-p", "--paths", nargs="+", default=["."], help="Lint only the given paths" + "-p", + "--paths", + nargs="+", + default=["."], + help="Lint only the given paths (recursively)", + ) + parser.add_argument( + "-n", + "--no-recursive", + action="store_true", + help="If paths are supplied with -p/--paths, do not recurse into paths", ) parser.add_argument( "-s", @@ -173,12 +210,15 @@ def parse_options(): def main(): options = parse_options() + paths = map(normalize_directory_path, options.paths) if options.revision: - files = get_changed_files(options.revision, options.paths, options.verbose) + files = get_changed_files(options.revision, paths, options.verbose) else: - files = get_all_files(options.paths) + files = get_all_files(paths) + if options.no_recursive: + files = remove_recursive_files(files, paths, options.verbose) file_patterns = get_file_patterns(options.glob, options.regex) - files = filter_files(files, file_patterns) + files = filter_files(files, file_patterns, options.verbose) # clang-tidy error's when it does not get input files. if not files: diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index b35435acb388c6..6a50c14e05523e 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ -DCMAKE_INSTALL_MESSAGE=NEVER \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -G "$GENERATE" \ $PYTORCHPATH/ $MAKE -j "$JOBS" install diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index 92a9b9981ed697..6dd9a589cf1074 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_INSTALL_MESSAGE=NEVER \ -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -DVERBOSE:BOOL=${VERBOSE:-0} \ -G "$GENERATE" \ $PYTORCHPATH/torch diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index ad9ad2e05c4f4c..5a76d447ad2498 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -52,28 +52,6 @@ def jit_type_of(arg): typ = '{}?'.format(typ) return typ -# map from aten 'simple_type' to the function that will cast a attribute value -# to that type -FROM_ATTRIBUTE = { - 'Device': 'as_device(node->is(attr::{}))', - 'IntList': 'std::vector(node->is(attr::{}))', - 'Layout': 'static_cast(node->i(attr::{}))', - 'Scalar': 'Scalar(node->t(attr::{}))', - 'ScalarType': 'static_cast(node->i(attr::{}))', - 'Tensor': 'node->t(attr::{})', - 'bool': 'bool(node->i(attr::{}))', - 'double': 'node->f(attr::{})', - 'int64_t': 'node->i(attr::{})', - 'std::array': 'as_bool_array<2>(node->is(attr::{}))', - 'std::array': 'as_bool_array<3>(node->is(attr::{}))', - 'std::array': 'as_bool_array<4>(node->is(attr::{}))', -} - - -def from_attribute(arg): - simple_type = arg['simple_type'] - return FROM_ATTRIBUTE[simple_type].format(arg['name']) - # map from aten 'simple_type' to the function that will turn a tensor into # that type @@ -84,6 +62,7 @@ def from_attribute(arg): 'Scalar': '{}.toScalar()', 'ScalarType': 'static_cast({}.toInt())', 'Tensor': '{}.toTensor()', + 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 
'int64_t': '{}.toInt()', @@ -98,15 +77,13 @@ def from_ivalue(arg, value): return FROM_IVALUE[simple_type].format(value) -KW_ACCESS = CodeTemplate("""(node->${method}(Symbol::attr("${name}")))""") - CALL_NAMESPACE = CodeTemplate("""\ auto result = at::${name}( ${args} ); """) CALL_METHOD = CodeTemplate("""\ -DeviceGuard device_guard(deviceForInputs(stack, ${num_dynamic_inputs})); +DeviceGuard device_guard(deviceForInputs(stack, ${num_inputs})); auto result = (${first}).${name}( ${args} ); @@ -122,24 +99,20 @@ def from_ivalue(arg, value): ); """) -# TODO (apaszke): remove the attributed codepath once we remove them CONSTRUCTOR = CodeTemplate("""\ -[](Node *node) { - ${kw_assignments} - return Operation([=](Stack & stack) { +[](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); ${call} - drop(stack, ${num_dynamic_inputs}); + drop(stack, ${num_inputs}); pack(stack, std::move(result)); return 0; - }); } """) OPERATOR = CodeTemplate("""\ Operator( "${signature}", - ${ops} + ${op} ), """) @@ -171,9 +144,6 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] - # Only support a single TensorList arg - if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: - return False return ((not decl['api_name'].endswith('_') or is_magic_method(decl['api_name'])) and not decl['name'].endswith('_out') and @@ -197,7 +167,7 @@ def gen_jit_dispatch(declarations, out, template_path): ops = [] - def get_invocation(decl, args, num_dynamic_inputs): + def get_invocation(decl, args, num_inputs): # because the arg list can get lengthy we put them on a separate line def pack_arguments(args): @@ -211,109 +181,36 @@ def pack_arguments(args): elif 'namespace' in decl['method_of']: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) else: return CALL_METHOD.substitute( name=decl['name'], first=args[0], args=pack_arguments(args[1:]), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) - def emit_decl_variant(decl, is_positional_arg, has_tensorlist): - # is_positional_arg is a boolean list the same length as decl['arguments'] - # that indicates if the argument should come from the postional list - # of inputs. If false, the argument comes from the constant attributes + def emit_decl_variant(decl): kw_assignments = [] arguments = [] - - if has_tensorlist: - kw_assignments.append('size_t varargs_length = node->inputs().size();') - # arguments look like: [tensor list], arg1, arg2, arg3 - # we use peek(, static_inputs) to read the non-vararg inputs - # from the end of the stack - static_inputs = sum(is_positional_arg) - 1 - num_dynamic_inputs = 'varargs_length' - tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] - else: - static_inputs = sum(is_positional_arg) - num_dynamic_inputs = static_inputs + num_inputs = len(decl['arguments']) real_inputs = 0 - for i, arg in enumerate(decl['arguments']): - # This conditional allows us to process argument lists with a flattened argument list - # with a single TensorList. Given the sequence of arguments: - # a b c [d e f g] h i # [] is the list - # - # 1. For the section where we are processing positional inputs before the - # TensorList: - # a b c [d e f g] h i # [] is the list - # ~~~~~~~~~~~~ <- N - # we set this view_length to the total number of varargs inputs (i.e. 
the length) - # of the whole argument list. This means that indexing into the list using peek() - # we will retrieve arguments ar their true indices (i.e. peek at 0 points to a, - # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the - # list itself this way. - # 2. After the list: - # a b c [d e f g] h i # [] is the list - # ~~~~~~ <- N - # Here we set the view length to static_inputs. In our example, - # we effectively ignore the fact that we have a list here. What is - # significant is that our index i is equivalent when the view length - # is right-justified, whether we have the list or not. Concretely, - # indexing h or i from `a b c [d e f g] h i` is equvalent to indexing - # h or i from `a b c h i`. - view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs - - if arg['simple_type'] == 'TensorList': - # NOTE: don't advance real_inputs here. After this we are going - # to switch over to indexing from the end as if we only had - # the static arguments. - arguments.append('toTensors(peekSlice(stack, {}, varargs_length - {}, varargs_length))' - .format(real_inputs, static_inputs)) - elif arg['simple_type'] in default_only_types: + for arg in decl['arguments']: + if arg['simple_type'] in default_only_types: arguments.append(arg['default']) - elif is_tensor_arg(arg) or is_positional_arg[i]: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, view_length) + else: + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) arguments.append(from_ivalue(arg, value)) real_inputs += 1 - else: - assign = "auto {} = {};".format(arg['name'], from_attribute(arg)) - kw_assignments.append(assign) - arguments.append(arg['name']) - call = get_invocation(decl, arguments, num_dynamic_inputs) + call = get_invocation(decl, arguments, num_inputs) returns = decl['returns'] - all_scalars = all(r['dynamic_type'] != 'TensorList' for r in returns) constructor = CONSTRUCTOR.substitute(name=decl['name'], call=call, kw_assignments=kw_assignments, - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) return constructor - def emit_decl(decl): - arguments = decl['arguments'] - has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) - num_tensor_args = sum(map(is_tensor_arg, arguments)) - - # Right now, we generate dispatch methods that either take all non-tensor arguments - # as attributes, or don't use any attributes at all. In the future we might want to - # have something in the middle too (might be useful for e.g. constant propagation - # into attributes, as that would allow us to avoid reparsing tensors into scalar - # args at every invocation). - - all_real_arguments_are_inputs = tuple(arg['simple_type'] not in default_only_types for arg in arguments) - only_tensors_are_inputs = tuple(is_tensor_arg(arg) for arg in arguments) - - variants = [emit_decl_variant(decl, all_real_arguments_are_inputs, has_tensorlist)] - # in some cases there are no inputs that are possibly attributes, so the - # variants are actually the same. If so avoid generating both to save compilation - # time. - if all_real_arguments_are_inputs != only_tensors_are_inputs: - variants += [',', emit_decl_variant(decl, only_tensors_are_inputs, has_tensorlist)] - - ops.append(OPERATOR.substitute(signature=signature(decl), - ops=variants)) - # This function declares an order on declarations. 
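The "right-justified" indexing that the removed comment describes is easier to see with a toy Python model of peek (hypothetical, not the actual C++ helper): reading argument i out of the last N stack entries yields the same element whether or not a variable-length list was pushed earlier.

    def peek(stack, i, n):
        # Element i of the last n values on the stack.
        return stack[len(stack) - n + i]

    stack = list("abcdefghi")     # here d, e, f, g stand in for a flattened TensorList
    print(peek(stack, 7, 9))      # 'h', indexing against the full varargs length
    print(peek(stack, 3, 5))      # 'h' again, indexing against only the 5 static args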
This is necessary because # there is some ambiguity in the choice of overload: if an argument is overloaded # to accept both Scalar and Tensor, the schema with the Tensor should come first @@ -376,7 +273,8 @@ def declkey(decl): jit_decls = sort_decls(jit_decls) for decl in jit_decls: - emit_decl(decl) + ops.append(OPERATOR.substitute(signature=signature(decl), + op=emit_decl_variant(decl))) # Sort the generated snippets to ensure that the generation is deterministic env = { diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 06ad9c2840b1cc..3dc973463d6e90 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -29,7 +29,6 @@ using autograd::Variable; using autograd::variable_list; using at::Scalar; using at::Tensor; -using at::TensorList; using at::TensorOptions; using at::DeviceGuard; @@ -42,26 +41,20 @@ int deviceForInputs(Stack & stack, size_t N) { return t.type().is_cuda() ? (int) t.get_device() : -1; } -std::vector toTensors(at::ArrayRef ivalues) { - return fmap(ivalues, [](const IValue& v) { - return v.toTensor(); - }); -} - template -std::array as_bool_array(const std::vector& vec) { +std::array as_bool_array(at::ArrayRef vec) { std::array res; JIT_ASSERT(vec.size() == N); std::copy(vec.begin(), vec.end(), res.begin()); return res; } -at::Device as_device(const std::vector& elements) { +at::Device as_device(ArrayRef elements) { return at::Device(static_cast(elements[0]), elements[1]); } RegisterOperators reg({ -${constructors} + ${constructors} }); } // anon namespace diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 88546fda7ed604..057bf6efeac3dd 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -102,6 +102,7 @@ add_custom_command( "${TOOLS_PATH}/autograd/gen_autograd.py" "${TOOLS_PATH}/autograd/gen_autograd_functions.py" "${TOOLS_PATH}/autograd/gen_variable_type.py" + "${TOOLS_PATH}/jit/gen_jit_dispatch.py" "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" "${TOOLS_PATH}/jit/templates/aten_interned_strings.h" WORKING_DIRECTORY "${TORCH_SRC_DIR}/..") @@ -138,6 +139,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp @@ -161,8 +163,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp ${TORCH_SRC_DIR}/csrc/jit/type.cpp - ${TORCH_SRC_DIR}/csrc/onnx/onnx.cpp - ${TORCH_SRC_DIR}/csrc/onnx/onnx.npb.cpp ${TORCH_SRC_DIR}/csrc/torch.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp @@ -267,6 +267,12 @@ if(OPENMP_FOUND) target_link_libraries(torch -fopenmp) endif() +if (NOT NO_API AND NOT USE_ROCM) + target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) +endif() + if(USE_CUDA) if(MSVC) set(TORCH_CUDA_LIBRARIES @@ -365,7 +371,7 @@ install(TARGETS torch ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") # JIT Tests. 
TODO: Put into test/cpp/jit folder -if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) +if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) @@ -379,10 +385,6 @@ if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) - target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) - set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") add_executable(test_api diff --git a/torch/__init__.py b/torch/__init__.py index 3fbb0b76fcc386..a40111bcca6b02 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -298,3 +298,8 @@ def manager_path(): # attach docstrings to torch and tensor functions from . import _torch_docs, _tensor_docs, _storage_docs del _torch_docs, _tensor_docs, _storage_docs + + +def compiled_with_cxx11_abi(): + r"""Returns whether PyTorch was built with _GLIBCXX_USE_CXX11_ABI=1""" + return _C._GLIBCXX_USE_CXX11_ABI diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 2194310a46d522..af367c3e544905 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -402,16 +402,6 @@ PyObject *THPModule_isDefaultTypeCuda(PyObject *_unused, PyObject *arg) { END_HANDLE_TH_ERRORS } -PyObject *THPModule_useZeroSizeDim(PyObject *_unused, PyObject *arg) { - HANDLE_TH_ERRORS -#ifdef USE_TH_SIZE_ZERO_DIM - Py_RETURN_TRUE; -#else - Py_RETURN_FALSE; -#endif - END_HANDLE_TH_ERRORS -} - static PyMethodDef TorchMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, NULL}, {"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, NULL}, @@ -442,7 +432,6 @@ static PyMethodDef TorchMethods[] = { {"set_flush_denormal", (PyCFunction)THPModule_setFlushDenormal, METH_O, NULL}, {"get_default_dtype", (PyCFunction)THPModule_getDefaultDtype, METH_NOARGS, NULL}, {"_is_default_type_cuda", (PyCFunction)THPModule_isDefaultTypeCuda, METH_NOARGS, NULL}, - {"_use_zero_size_dim", (PyCFunction)THPModule_useZeroSizeDim, METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} }; @@ -624,6 +613,13 @@ static PyObject* initModule() { ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); +#ifdef _GLIBCXX_USE_CXX11_ABI + ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", + _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); +#else + ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); +#endif + auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index c0f56eea72fbd0..2ae5c5d93752c1 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -48,7 +48,7 @@ class CursorBase { /// A `(key, value)` pair exposed by cursor iterators. 
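The compiled_with_cxx11_abi() helper added to torch/__init__.py above can be queried directly from Python, which is useful when building C++ extensions that must link against the same libstdc++ ABI as the installed wheel:

    import torch

    # True when the binary was built with _GLIBCXX_USE_CXX11_ABI=1.
    print(torch.compiled_with_cxx11_abi())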
struct Item { - Item(const std::string& key_, T& module_); + Item(const std::string& key_, T& value_); T& operator*(); const T& operator*() const; diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 7327d03f11b887..1f12f0a65c7460 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -18,7 +18,7 @@ struct AnomalyMode { struct AnomalyMetadata { - virtual ~AnomalyMetadata(){}; + virtual ~AnomalyMetadata() = default; virtual void store_stack() = 0; virtual void print_stack() = 0; }; diff --git a/torch/csrc/autograd/aten_variable_hooks.cpp b/torch/csrc/autograd/aten_variable_hooks.cpp index 7a2c3974c2227c..2f3899e4f8b59a 100644 --- a/torch/csrc/autograd/aten_variable_hooks.cpp +++ b/torch/csrc/autograd/aten_variable_hooks.cpp @@ -6,6 +6,7 @@ namespace torch { namespace autograd { struct VariableHooks : public at::VariableHooksInterface { VariableHooks(at::VariableHooksArgs) {} void registerVariableTypeFor(at::Context*, at::Backend, at::ScalarType) const override; + at::Type& getVariableType(const at::Type&) const override; }; // Sigh, the registry doesn't support namespaces :( @@ -20,4 +21,8 @@ void VariableHooks::registerVariableTypeFor(at::Context* context, at::Backend ba register_variable_type_for(baseType); } +at::Type& VariableHooks::getVariableType(const at::Type& baseType) const { + return *VariableType::getType(baseType); +} + }} // torch::autograd diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 8309ba1ce1038c..74e15f5caefe9d 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -159,7 +159,7 @@ struct GraphTask { std::unordered_map exec_info; std::vector captured_vars; - void init_to_execute(Function& graph_root, const edge_list& captures); + void init_to_execute(Function& graph_root, const edge_list& outputs); // The value of worker_device in the thread that created this task. 
// See Note [Reentrant backwards] @@ -499,14 +499,14 @@ struct ClearCallbacks { std::mutex& callbacks_lock; }; -auto Engine::execute(const edge_list& input_roots, +auto Engine::execute(const edge_list& roots, const variable_list& inputs, bool keep_graph, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); - validate_outputs(input_roots, const_cast(inputs), [](const std::string& msg) { + validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { return msg; }); @@ -517,7 +517,7 @@ auto Engine::execute(const edge_list& input_roots, std::unique_lock lock(graph_task.mutex); // Now compute the dependencies for all executable functions and queue the root - auto graph_root = std::make_shared(input_roots, inputs); + auto graph_root = std::make_shared(roots, inputs); compute_dependencies(graph_root.get(), graph_task); if (!outputs.empty()) { graph_task.init_to_execute(*graph_root, outputs); diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index db8b3357ac2536..94490303ccc240 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -57,7 +57,7 @@ struct TORCH_API Engine { ReadyQueue& ready_queue(int device); void start_threads(); virtual void thread_init(int device); - virtual void thread_main(GraphTask *task); + virtual void thread_main(GraphTask *graph_task); virtual void thread_on_exception(FunctionTask& task, std::exception& e); std::once_flag start_threads_flag; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b02bdf3928f2ff..46a80b90b29ffa 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -328,7 +328,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// See Function::is_traceable() for definition. 
struct TraceableFunction : public Function { using Function::Function; - bool is_traceable() final override { + bool is_traceable() final { return true; } }; diff --git a/torch/csrc/autograd/function_hook.h b/torch/csrc/autograd/function_hook.h index 03c52fea54535c..f3cf5b2e793c6a 100644 --- a/torch/csrc/autograd/function_hook.h +++ b/torch/csrc/autograd/function_hook.h @@ -10,12 +10,12 @@ struct Variable; using variable_list = std::vector; struct FunctionPreHook { - virtual ~FunctionPreHook() {} + virtual ~FunctionPreHook() = default; virtual variable_list operator()(const variable_list& grads) = 0; }; struct FunctionPostHook { - virtual ~FunctionPostHook() {} + virtual ~FunctionPostHook() = default; virtual variable_list operator()(const variable_list& grad_input, const variable_list& grad_output) = 0; }; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 44d4b7f106c860..db86ae428d4060 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -6,9 +6,9 @@ namespace torch { namespace autograd { struct AccumulateGrad : public Function { - explicit AccumulateGrad(Variable variable); + explicit AccumulateGrad(Variable variable_); - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; Variable variable; }; diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index b04b0f25ca42d5..c4a54d99d08702 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -11,7 +11,7 @@ namespace torch { namespace autograd { -auto Error::apply(variable_list&& grad_outputs) -> variable_list { +auto Error::apply(variable_list&& inputs) -> variable_list { throw std::runtime_error(msg); } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index aa4b422136930f..1a21a360ba9fc2 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -13,7 +13,7 @@ namespace torch { namespace autograd { struct CopyBackwards : public Function { - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; at::Type *src_type; int32_t src_device = -1; @@ -23,9 +23,12 @@ struct CopyBackwards : public Function { // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. struct CopySlices : public Function { - CopySlices(const Variable& base, at::TensorGeometry view, std::shared_ptr fn); + CopySlices( + const Variable& base_var, + at::TensorGeometry view_, + std::shared_ptr fn_); - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; void release_variables() override; at::TensorGeometry base; diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index 2e0febfc84b0bc..f1c02e0d78e565 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -22,14 +22,14 @@ struct InputBuffer { InputBuffer& operator=(InputBuffer&& other) = default; // Accumulates the variable at a specified index. - void add(size_t idx, Variable var); + void add(size_t pos, Variable var); int device() const; Variable operator[](size_t pos) { return buffer[pos]; } // Returns the inputs as a list of variables. Destroys given InputBuffer. 
- static std::vector variables(InputBuffer&& buffer); + static std::vector variables(InputBuffer&& g); private: std::vector buffer; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index dd77dc193ba9bd..ba0fee1510baa2 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -185,7 +185,7 @@ struct TORCH_API RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that // there no autograd functions are being executed when these function are used. -TORCH_API void enableProfiler(ProfilerState state); +TORCH_API void enableProfiler(ProfilerState new_state); TORCH_API thread_event_lists disableProfiler(); } // namespace profiler diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 08e494530040eb..e9d29bd0caa688 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -45,7 +45,7 @@ namespace torch { namespace autograd { VariableInfo::VariableInfo(const Variable& var) : type(&var.type()) - , size(var.sizes()) + , size(var.sizes().vec()) , requires_grad(var.requires_grad()) { if (var.type().is_cuda()) { device = var.get_device(); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index cd8329cad01434..1aa21f84d45cf2 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -154,14 +154,6 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis result = applySelect(result, dim, THPUtils_unpackLong(obj)); } else if (PySlice_Check(obj)) { result = applySlice(result, dim, obj); -#ifndef USE_TH_SIZE_ZERO_DIM - if (result.numel() == 0) { - // TODO: currently we don't have support for 0-sized dims, so slicing a dim - // to size 0 will return a size 0 tensor. for now, just shortcircuit slicing - // and return that size 0 tensor. 
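With the shortcircuit above removed, slicing a dimension down to length zero keeps the remaining dimensions instead of collapsing to a 1-D empty tensor, consistent with the updated tests earlier in this diff. For example:

    import torch

    x = torch.randn(2, 3, 4)
    print(x[0:0].shape)                         # torch.Size([0, 3, 4]) -- other dims preserved
    print(torch.empty(0, 4).shape)              # torch.Size([0, 4])
    print(torch.empty(0).reshape(1, -1).shape)  # torch.Size([1, 0])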
- return result; - } -#endif dim++; } else if (obj == Py_Ellipsis) { dim += self.dim() - specified_dims; diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 61a1d3b3eac172..037f06a7f95c11 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -45,10 +45,10 @@ class TORCH_API SavedVariable { std::weak_ptr grad_accumulator_; VariableVersion version_counter_; - uint32_t saved_version_; - uint32_t output_nr_; + uint32_t saved_version_ = 0; + uint32_t output_nr_ = 0; bool was_default_constructed_ = true; - bool requires_grad_; - bool has_grad_fn_; + bool requires_grad_ = false; + bool has_grad_fn_ = false; }; }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 9bbae25d9c4d96..30aded0a85e73a 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(VariableType::getType(data), nullptr), + : TensorImpl(data.type().backend(), data.type().scalarType(), nullptr, /* is variable */ true), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -118,7 +118,9 @@ void Variable::Impl::backward( void Variable::Impl::set_data(Tensor new_data) { if (new_data.type() != data_.type()) { - type_ = VariableType::getType(new_data.type()); + scalar_type_ = new_data.type().scalarType(); + backend_ = new_data.type().backend(); + is_variable_ = true; // Clear grad_accumulator if it exists, since it stores the old type info. grad_accumulator_.reset(); } @@ -154,8 +156,8 @@ std::shared_ptr& Variable::ViewImpl::get_grad_fn() { AT_ASSERT(output_nr_ == 0); auto fn = std::make_shared(); fn->self_geometry = at::TensorGeometry(base_); - fn->size = sizes(); - fn->stride = strides(); + fn->size = sizes().vec(); + fn->stride = strides().vec(); fn->storage_offset = data_.storage_offset(); fn->set_next_edges(collect_next_edges(base_)); fn->add_input_metadata(base_.type(), sizes()); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index c97a0322359a4d..d46008bbdd10b0 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -263,7 +263,7 @@ struct Variable::Impl : public at::TensorImpl { TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, - Edge edge = Edge()); + Edge gradient_edge = Edge()); ~Impl() override; @@ -327,9 +327,6 @@ struct Variable::Impl : public at::TensorImpl { /// Reset all expensive fields to free up resources void release_resources() override; - // Make this field public so we can access it from `Variable`. 
-  using at::TensorImpl::type_;
-
   std::string name;
   at::Tensor data_;
diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
index 0e869876e8e1fa..8237239f99b639 100644
--- a/torch/csrc/cuda/comm.cpp
+++ b/torch/csrc/cuda/comm.cpp
@@ -74,7 +74,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu
   }
 
   tensor_list2d outputs(devices.size());
-  outputs[0] = tensors;
+  outputs[0] = tensors.vec();
   for (auto & o : outputs)
     o.reserve(tensors.size());
 
diff --git a/torch/csrc/distributed/c10d/ddp.h b/torch/csrc/distributed/c10d/ddp.h
new file mode 100644
index 00000000000000..7b26c1475fc1c6
--- /dev/null
+++ b/torch/csrc/distributed/c10d/ddp.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include 
+
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+
+namespace c10d {
+inline void distBroadcastCoalesced(
+    std::vector& tensors,
+    int64_t bufferSize,
+    ProcessGroup& processGroup) {
+  auto tensorGroups = torch::utils::take_tensors(tensors, bufferSize);
+  // We store single-element vectors in `flatTensors` because
+  // `ProcessGroup::broadcast` takes a reference to a vector, which must be
+  // alive until the `wait()` call on the returned `Work` completes.
+  std::vector> flatTensors;
+  std::vector> work;
+  flatTensors.reserve(tensorGroups.size());
+  work.reserve(tensorGroups.size());
+  for (const auto& group : tensorGroups) {
+    // Flatten each group of tensors (whose size equals `bufferSize`) into a
+    // single tensor.
+    flatTensors.push_back({torch::utils::flatten_dense_tensors(group.tensors)});
+    BroadcastOptions broadcastOptions;
+    broadcastOptions.rootRank = 0;
+    broadcastOptions.rootTensor = 0;
+    // Enqueue a work item and collect the `Work` (essentially a "future") so we
+    // can `wait()` for its completion after we have collected all `Work` items.
+    work.push_back(
+        processGroup.broadcast(flatTensors.back(), broadcastOptions));
+  }
+  // Now loop through each group, wait for the broadcast to complete, and
+  // un-flatten the broadcast tensor back into device-local individual tensors.
+  for (size_t group = 0; group < tensorGroups.size(); ++group) {
+    auto& tensors = tensorGroups[group].tensors;
+    work[group]->wait();
+    const auto synced =
+        torch::utils::unflatten_dense_tensors(flatTensors[group][0], tensors);
+    AT_ASSERT(synced.size() == tensors.size());
+    for (size_t i = 0; i < synced.size(); ++i) {
+      // Copy into the per-process tensors.
+ tensors[i].copy_(synced[i], /*non_blocking=*/true); + } + } +} +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 2bd7a871dc36fc..797fcbcdd2432e 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -13,9 +13,10 @@ #include #include -#include "torch/csrc/Exceptions.h" -#include "torch/csrc/utils/object_ptr.h" -#include "torch/csrc/utils/pybind.h" +#include +#include +#include +#include namespace torch { namespace distributed { @@ -199,6 +200,8 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); + module.def("_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced); + Py_RETURN_TRUE; } diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index d6bd90cb708784..f404b4ce9a05c6 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -59,20 +59,21 @@ struct ArgumentSpec { for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; pod.is_tensor = static_cast(inputs[i].isTensor()); - if (!pod.is_tensor) continue; - at::Tensor t = inputs[i].toTensor(); - pod.defined = t.defined(); - if (pod.defined) { - pod.type = static_cast(t.type().scalarType()); - pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); - total_dims += t.ndimension(); - auto sizes = t.sizes(); - std::copy(sizes.begin(),sizes.end(), next_dim); - next_dim += sizes.size(); - auto strides = t.strides(); - std::copy(strides.begin(), strides.end(), next_dim); - next_dim += strides.size(); + if (pod.is_tensor) { + at::Tensor t = inputs[i].toTensor(); + pod.defined = t.defined(); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); + pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); + total_dims += t.ndimension(); + auto sizes = t.sizes(); + std::copy(sizes.begin(),sizes.end(), next_dim); + next_dim += sizes.size(); + auto strides = t.strides(); + std::copy(strides.begin(), strides.end(), next_dim); + next_dim += strides.size(); + } } // each POD has a running tally of all dimensions including its own pod.total_dims = total_dims; diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index f69790cab52e00..53b87af9ef991d 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -28,7 +28,7 @@ struct AttributeValue { Symbol name; virtual AttributeKind kind() const = 0; virtual Ptr clone() const = 0; - virtual ~AttributeValue() {} + virtual ~AttributeValue() = default; }; template @@ -101,7 +101,7 @@ struct AttributeError : public std::exception { // we return Derived* pointers because Nodes are normally held as pointers. 
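The distBroadcastCoalesced helper introduced in ddp.h above follows a flatten, broadcast, unflatten, then copy-back pattern. A rough Python analogue using the existing torch._utils flatten helpers (the collective itself is elided here, since it needs an initialized process group):

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    tensors = [torch.ones(3), torch.arange(4.)]
    flat = _flatten_dense_tensors(tensors)   # one contiguous buffer for the whole bucket
    # ... a broadcast over `flat` would run here ...
    for t, synced in zip(tensors, _unflatten_dense_tensors(flat, tensors)):
        t.copy_(synced)                      # copy synchronized values back in place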
template struct Attributes { - Attributes() {} + Attributes() = default; void copyAttributes(const Attributes & rhs) { values_.clear(); for(auto & i : rhs.values_) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index c830dc45a537f5..7f250bf7c452aa 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace torch { namespace jit { @@ -564,14 +565,13 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& _graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { Gradient grad_desc; // Take ownership of the graph - JIT_ASSERTM( - _graph.use_count() == 1, - "differentiate will mutate and destroy the graph, so it requires " - "graph.use_count() == 1, but found ", _graph.use_count()); - std::swap(_graph, grad_desc.f); + JIT_ASSERTM(graph.use_count() == 1, + "differentiate will mutate and destroy the graph, so it requires " + "graph.use_count() == 1, but found %d", graph.use_count()); + std::swap(graph, grad_desc.f); // XXX: Take care when handling outputs - they can be duplicated! WithInsertPoint guard(grad_desc.f->block()); diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index 6dd2be9db0e779..ea2b7a1170efeb 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -4,7 +4,9 @@ #include "torch/csrc/jit/ir.h" #include + #include +#include namespace torch { namespace jit { diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 3c4ad0c130ea31..47e593bbb125e2 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -22,8 +22,13 @@ Value* insertConstant( n->f_(attr::value, val.toDouble()); n->output()->setType(FloatType::get()); } else if(val.isIntList()) { - n->is_(attr::value, val.toIntList()->elements()); + n->is_(attr::value, val.toIntList()->elements().vec()); n->output()->setType(ListType::ofInts()); + } else if(val.isTensorList()) { + n->ts_(attr::value, fmap(val.toTensorList()->elements(), [](const at::Tensor & t) { + return autograd::Variable(t).data(); + })); + n->output()->setType(ListType::ofTensors()); } else { throw std::runtime_error("Unsupported value kind: " + val.tagKind()); } @@ -66,6 +71,14 @@ RegisterOperators reg({ push(stack, is); return 0; }; + } else if(type->isSubtypeOf(ListType::ofTensors())) { + auto ts = fmap(node->ts(attr::value), [](const at::Tensor & t) -> at::Tensor { + return autograd::make_variable(t); + }); + return [ts](Stack& stack) { + push(stack, ts); + return 0; + }; } else { std::stringstream ss; ss << "constant literal not supported for: " << type->str(); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 71dec999c40216..20208af5496c28 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -1,6 +1,7 @@ #include "torch/csrc/jit/export.h" -#include "torch/csrc/onnx/onnx.h" #include "torch/csrc/autograd/symbolic.h" +#include "onnx/onnx.pb.h" +#include "torch/csrc/onnx/onnx.h" #include "torch/csrc/utils/functional.h" #include @@ -18,7 +19,8 @@ namespace torch { namespace jit { namespace { -namespace onnx = ::torch::onnx; +namespace onnx_torch = ::torch::onnx; +namespace onnx = ::ONNX_NAMESPACE; std::string value_name(Value* n) { return n->uniqueName(); @@ -26,7 +28,7 @@ std::string value_name(Value* n) { struct ExportContext { size_t num_blocks = 0; - 
onnx::OperatorExportTypes operator_export_type; + onnx_torch::OperatorExportTypes operator_export_type; }; void encodeGraph(onnx::GraphProto * p_g, const std::shared_ptr & g, @@ -43,34 +45,37 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, for(auto d : tensor.sizes()) { p->add_dims(d); } - onnx::DataType onnx_type; + onnx::TensorProto_DataType onnx_type; // Most integral types and float16 need to be serialized as int32 at::ScalarType cast_type = tensor.type().scalarType(); switch(tensor.type().scalarType()) { case at::kDouble: - onnx_type = onnx::kDOUBLE; + onnx_type = onnx::TensorProto_DataType_DOUBLE; break; case at::kFloat: - onnx_type = onnx::kFLOAT; + onnx_type = onnx::TensorProto_DataType_FLOAT; break; case at::kHalf: - onnx_type = onnx::kFLOAT16; + onnx_type = onnx::TensorProto_DataType_FLOAT16; cast_type = at::kInt; break; case at::kByte: + onnx_type = onnx::TensorProto_DataType_UINT8; + cast_type = at::kInt; + break; case at::kChar: - onnx_type = onnx::kINT8; + onnx_type = onnx::TensorProto_DataType_INT8; cast_type = at::kInt; break; case at::kShort: - onnx_type = onnx::kINT16; + onnx_type = onnx::TensorProto_DataType_INT16; cast_type = at::kInt; break; case at::kInt: - onnx_type = onnx::kINT32; + onnx_type = onnx::TensorProto_DataType_INT32; break; case at::kLong: - onnx_type = onnx::kINT64; + onnx_type = onnx::TensorProto_DataType_INT64; break; default: AT_ERROR("unexpected tensor scalar type"); @@ -85,13 +90,14 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, if (external_ref) { // For now, we use the name of the tensor as the external lookup name to // avoid ONNX protobuf changes. - JIT_ASSERT(external_ref.value() == p->get_name()); + JIT_ASSERT(external_ref.value() == p->name()); JIT_ASSERT(raw_data_export_map != nullptr); JIT_ASSERT(raw_data_export_map->count(external_ref.value()) == 0); (*raw_data_export_map)[external_ref.value()] = t; - p->set_external_data_present(); + p->set_raw_data("__EXTERNAL"); } else { - p->set_raw_data(t); + JIT_ASSERT(t.is_contiguous()); + p->set_raw_data(std::string(static_cast(t.data_ptr()), t.type().elementSizeInBytes() * t.numel())); } } @@ -102,50 +108,50 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export switch(n->kindOf(name)) { case AttributeKind::f: attr->set_f(n->f(name)); - attr->set_type(onnx::aFLOAT); + attr->set_type(onnx::AttributeProto_AttributeType_FLOAT); break; case AttributeKind::fs: - attr->set_type(onnx::aFLOATS); + attr->set_type(onnx::AttributeProto_AttributeType_FLOATS); for(auto & v : n->fs(name)) attr->add_floats(v); break; case AttributeKind::i: - attr->set_type(onnx::aINT); + attr->set_type(onnx::AttributeProto_AttributeType_INT); attr->set_i(n->i(name)); break; case AttributeKind::is: - attr->set_type(onnx::aINTS); + attr->set_type(onnx::AttributeProto_AttributeType_INTS); for(auto & v : n->is(name)) attr->add_ints(v); break; case AttributeKind::s: - attr->set_type(onnx::aSTRING); + attr->set_type(onnx::AttributeProto_AttributeType_STRING); attr->set_s(n->s(name)); break; case AttributeKind::ss: - attr->set_type(onnx::aSTRINGS); + attr->set_type(onnx::AttributeProto_AttributeType_STRINGS); for(auto & v : n->ss(name)) attr->add_strings(v); break; case AttributeKind::t: { - attr->set_type(onnx::aTENSOR); + attr->set_type(onnx::AttributeProto_AttributeType_TENSOR); auto t = attr->mutable_t(); encodeTensor(t, n->t(name)); } break; case AttributeKind::ts: - attr->set_type(onnx::aTENSORS); + 
attr->set_type(onnx::AttributeProto_AttributeType_TENSORS); for(auto & v : n->ts(name)) { auto t = attr->add_tensors(); encodeTensor(t, v); } break; case AttributeKind::g: { - attr->set_type(onnx::aGRAPH); + attr->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto g = attr->mutable_g(); encodeGraph(g, n->g(name), {}, ctx, nullptr); } break; case AttributeKind::gs: - attr->set_type(onnx::aGRAPHS); + attr->set_type(onnx::AttributeProto_AttributeType_GRAPHS); for(auto & v : n->gs(name)) { auto g = attr->add_graphs(); encodeGraph(g, v, {}, ctx, nullptr); @@ -154,49 +160,52 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export } } -void encodeTypeProtoTensorType(onnx::TypeProtoTensor* tensor_type, Value* n) { +void encodeTypeProtoTensorType(onnx::TypeProto_Tensor* tensor_type, Value* n) { onnx::TensorShapeProto* shape = tensor_type->mutable_shape(); if (TensorTypePtr node_type = n->type()->cast()) { const std::vector& sizes = node_type->sizes(); - for (std::int64_t s : sizes) { - shape->add_dim(s); + for (size_t i = 0; i < sizes.size(); i++) { + shape->add_dim(); + shape->mutable_dim(i)->set_dim_value(sizes[i]); } - onnx::DataType onnx_type; + onnx::TensorProto_DataType onnx_type; switch(node_type->scalarType()) { case at::kDouble: - onnx_type = onnx::kDOUBLE; + onnx_type = onnx::TensorProto_DataType_DOUBLE; break; case at::kFloat: - onnx_type = onnx::kFLOAT; + onnx_type = onnx::TensorProto_DataType_FLOAT; break; case at::kHalf: - onnx_type = onnx::kFLOAT16; + onnx_type = onnx::TensorProto_DataType_FLOAT16; break; case at::kByte: + onnx_type = onnx::TensorProto_DataType_UINT8; + break; case at::kChar: - onnx_type = onnx::kINT8; + onnx_type = onnx::TensorProto_DataType_INT8; break; case at::kShort: - onnx_type = onnx::kINT16; + onnx_type = onnx::TensorProto_DataType_INT16; break; case at::kInt: - onnx_type = onnx::kINT32; + onnx_type = onnx::TensorProto_DataType_INT32; break; case at::kLong: - onnx_type = onnx::kINT64; + onnx_type = onnx::TensorProto_DataType_INT64; break; default: AT_ERROR("unexpected tensor scalar type"); break; } - tensor_type->set_data_type(onnx_type); + tensor_type->set_elem_type(onnx_type); } } void encodeValueInfo(onnx::ValueInfoProto* v, Value* n) { v->set_name(value_name(n)); onnx::TypeProto* t = v->mutable_type(); - onnx::TypeProtoTensor* tensor_type = t->mutable_tensor_type(); + onnx::TypeProto_Tensor* tensor_type = t->mutable_tensor_type(); encodeTypeProtoTensorType(tensor_type, n); } @@ -226,7 +235,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, encodeValueInfo(v, output); } for (auto node : b->nodes()) { - bool is_raw_export = ctx->operator_export_type == onnx::OperatorExportTypes::RAW; + bool is_raw_export = ctx->operator_export_type == onnx_torch::OperatorExportTypes::RAW; if (node->kind() == prim::Undefined && !is_raw_export) { // Undefined nodes are used to implement optional inputs. 
One // way to "not provide" an optional input is to create an @@ -253,7 +262,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, JIT_ASSERT(!node->kind().is_onnx()); p_n->set_domain(node->kind().domainString()); } - else if (ctx->operator_export_type != onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK) { + else if (ctx->operator_export_type != onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { JIT_ASSERT(node->kind().is_onnx()); } p_n->set_op_type(node->kind().toUnqualString()); @@ -263,7 +272,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, if (is_raw_export && node->blocks().size() > 0) { auto blocks = p_n->add_attribute(); blocks->set_name("_blocks"); - blocks->set_type(onnx::aGRAPHS); + blocks->set_type(onnx::AttributeProto_AttributeType_GRAPHS); for (auto block : node->blocks()) { auto graph = blocks->add_graphs(); encodeBlock(graph, block, initializers, ctx, raw_data_export_map); @@ -274,7 +283,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto body = p_n->add_attribute(); body->set_name("body"); - body->set_type(onnx::aGRAPH); + body->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto g = body->mutable_g(); encodeBlock(g, node->blocks()[0], {}, ctx, raw_data_export_map); } @@ -283,13 +292,13 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto true_branch = p_n->add_attribute(); true_branch->set_name("then_branch"); - true_branch->set_type(onnx::aGRAPH); + true_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto true_g = true_branch->mutable_g(); encodeBlock(true_g, node->blocks()[0], {}, ctx, raw_data_export_map); auto false_branch = p_n->add_attribute(); false_branch->set_name("else_branch"); - false_branch->set_type(onnx::aGRAPH); + false_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto false_g = false_branch->mutable_g(); encodeBlock(false_g, node->blocks()[1], {}, ctx, raw_data_export_map); } @@ -300,7 +309,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, for (auto & tensor : initializers) { // TODO: stop using positions to determine which initializers // match to which inputs - std::string name = p_g->get_input_name(inputs_count++); + std::string name = p_g->input(inputs_count++).name(); auto p = p_g->add_initializer(); p->set_name(name); if (raw_data_export_map) { @@ -314,8 +323,8 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, void encodeModel(onnx::ModelProto* p_m, const std::shared_ptr& g, const std::vector& initializers, RawDataExportMap* raw_data_export_map = nullptr, - onnx::OperatorExportTypes operator_export_type - = onnx::OperatorExportTypes::ONNX) { + onnx_torch::OperatorExportTypes operator_export_type + = onnx_torch::OperatorExportTypes::ONNX) { onnx::GraphProto* p_g = p_m->mutable_graph(); ExportContext ctx; ctx.operator_export_type = operator_export_type; @@ -334,7 +343,7 @@ std::string getNodeStackTraceString(Node* n) { } } // namespace -void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportTypes operator_export_type) { +void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExportTypes operator_export_type) { for (auto node : graph->nodes()) { // Macro'ed so we get a marginally better line number on failed export #define FAIL_EXPORT(name) \ @@ -356,7 +365,7 @@ void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportType "Cannot export individual pack_padded_sequence or pad_packed_sequence; these operations must occur in pairs.\n\nUsage of this operation occurred at:\n" + getNodeStackTraceString(node)); } - bool is_aten_fallback = 
operator_export_type == onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK; + bool is_aten_fallback = operator_export_type == onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK; if (!node->kind().is_onnx() && !is_aten_fallback && node->kind() != prim::Undefined) { FAIL_EXPORT( "Couldn't export operator " + node->kind().toDisplayString() + "\n\nDefined at:\n" + @@ -367,6 +376,182 @@ void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportType } } +// Pretty printing +namespace { +constexpr char indent_char = ' '; +constexpr size_t indent_multiplier = 2; + +std::string idt(size_t indent) { + return std::string(indent * indent_multiplier, indent_char); +} + +std::string nlidt(size_t indent) { + return std::string("\n") + idt(indent); +} + +void dump(const onnx::TensorProto& tensor, std::ostream& stream) { + stream << "TensorProto shape: ["; + for (int i = 0; i < tensor.dims_size(); ++i) { + stream << tensor.dims(i) << (i == tensor.dims_size() - 1 ? "" : " "); + } + stream << "]"; +} + +void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { + for (int i = 0; i < shape.dim_size(); ++i) { + auto &dim = shape.dim(i); + if (dim.has_dim_value()) { + stream << dim.dim_value(); + } else { + stream << "?"; + } + stream << (i == shape.dim_size() - 1 ? "" : " "); + } +} + +void dump(const onnx::TypeProto_Tensor& tensor_type, std::ostream& stream) { + stream << "Tensor dims: "; + dump(tensor_type.shape(), stream); +} + +void dump(const onnx::TypeProto& type, std::ostream& stream) { + dump(type.tensor_type(), stream); +} + +void dump(const onnx::ValueInfoProto& value_info, std::ostream& stream) { + stream << "{name: \"" << value_info.name() + << "\", type:"; + dump(value_info.type(), stream); + stream << "}"; +} + +void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent); + +void dump(const onnx::AttributeProto& attr, std::ostream& stream, size_t indent) { + stream << "{ name: '" << attr.name() << "', type: "; + if (attr.has_f()) { + stream << "float, value: " << attr.f(); + } else if (attr.has_i()) { + stream << "int, value: " << attr.i(); + } else if (attr.has_s()) { + stream << "string, value: '" << attr.s() << "'"; + } else if (attr.has_g()) { + stream << "graph, value:\n"; + dump(attr.g(), stream, indent+1); + stream << nlidt(indent); + } else if (attr.has_t()) { + stream << "tensor, value:"; + dump(attr.t(), stream); + } else if (attr.floats_size()) { + stream << "floats, values: ["; + for (int i = 0; i < attr.floats_size(); ++i) + stream << attr.floats(i) << (i == attr.floats_size() - 1 ? "" : " "); + stream << "]"; + } else if (attr.ints_size()) { + stream << "ints, values: ["; + for (int i = 0; i < attr.ints_size(); ++i) + stream << attr.ints(i) << (i == attr.ints_size() - 1 ? "" : " "); + stream << "]"; + } else if (attr.strings_size()) { + stream << "strings, values: ["; + for (int i = 0; i < attr.strings_size(); ++i) + stream << "'" << attr.strings(i) << "'" << (i == attr.strings_size() - 1 ? 
"" : " "); + stream << "]"; + } else if (attr.tensors_size()) { + stream << "tensors, values: ["; + for (auto& t : attr.tensors()) { + dump(t, stream); + } + stream << "]"; + } else if (attr.graphs_size()) { + stream << "graphs, values: ["; + for (auto& g : attr.graphs()) { + dump(g, stream, indent+1); + } + stream << "]"; + } else { + stream << "UNKNOWN"; + } + stream << "}"; +} + +void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { + stream << "Node {type: \"" << node.op_type() << "\", inputs: ["; + for (int i = 0; i < node.input_size(); ++i) { + stream << node.input(i) << (i == node.input_size() - 1 ? "" : ","); + } + stream << "], outputs: ["; + for (int i = 0; i < node.output_size(); ++i) { + stream << node.output(i) << (i == node.output_size() - 1 ? "" : ","); + } + stream << "], attributes: ["; + for (int i = 0; i < node.attribute_size(); ++i) { + dump(node.attribute(i), stream, indent+1); + stream << (i == node.attribute_size() - 1 ? "" : ","); + } + stream << "]}"; +} + +void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent) { + stream << idt(indent) << "GraphProto {" << nlidt(indent+1) + << "name: \"" << graph.name() << "\"" << nlidt(indent+1) + << "inputs: ["; + for (int i = 0; i < graph.input_size(); ++i) { + dump(graph.input(i), stream); + stream << (i == graph.input_size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "outputs: ["; + for (int i = 0; i < graph.output_size(); ++i) { + dump(graph.output(i), stream); + stream << (i == graph.output_size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "initializers: ["; + for (int i = 0; i < graph.initializer_size(); ++i) { + dump(graph.initializer(i), stream); + stream << (i == graph.initializer_size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "nodes: [" << nlidt(indent+2); + for (int i = 0; i < graph.node_size(); ++i) { + dump(graph.node(i), stream, indent+2); + if (i != graph.node_size() - 1) stream << "," << nlidt(indent+2); + } + stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; +} + +void dump(const onnx::OperatorSetIdProto& operator_set_id, std::ostream& stream) { + stream << "OperatorSetIdProto { domain: " << operator_set_id.domain() << "}"; +} + +void dump(const onnx::ModelProto& model, std::ostream& stream, size_t indent) { + stream << idt(indent) + << "ModelProto {" << nlidt(indent+1) + << "producer_name: \"" << model.producer_name() << "\"" << nlidt(indent+1) + << "domain: \"" << model.domain() << "\"" << nlidt(indent+1) + << "doc_string: \"" << model.doc_string() << "\""; + if (model.has_graph()) { + stream << nlidt(indent+1) << "graph:\n"; + dump(model.graph(), stream, indent+2); + } + if (model.opset_import_size()) { + stream << idt(indent+1) << "opset_import: ["; + for (auto &opset_imp : model.opset_import()) { + dump(opset_imp, stream); + } + stream << "],\n"; + } + stream << idt(indent) << "}\n"; +} +} // namespace + +std::string prettyPrint(const onnx::ModelProto& model) { + std::stringstream ss; + dump(model, ss, 0); + return ss.str(); +} + } namespace { @@ -376,14 +561,15 @@ RawDataExportMap ToModelProto( const std::vector & initializers, int64_t onnx_opset_version, bool defer_weight_export, - onnx::OperatorExportTypes operator_export_type, + onnx_torch::OperatorExportTypes operator_export_type, onnx::ModelProto *model_proto) { - if (operator_export_type != onnx::OperatorExportTypes::RAW) { + if (operator_export_type != onnx_torch::OperatorExportTypes::RAW) { validateGraph(graph, operator_export_type); } 
model_proto->set_producer_name("pytorch"); model_proto->set_producer_version("0.3"); + model_proto->set_ir_version(onnx::IR_VERSION); auto* imp = model_proto->add_opset_import(); // This is the version of ONNX operator set we are targeting imp->set_version(onnx_opset_version); @@ -411,12 +597,12 @@ std::string PrettyPrintExportedGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::torch::onnx::ModelProto model_proto; + ::ONNX_NAMESPACE::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - return model_proto.prettyPrint(); + return prettyPrint(model_proto); } // export_raw_ir will export IR ops without turning them into ONNX ops. @@ -430,21 +616,12 @@ std::tuple ExportGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::torch::onnx::ModelProto model_proto; + ::ONNX_NAMESPACE::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - - size_t out_size; - pb_get_encoded_size(&out_size, onnx_ModelProto_fields, &model_proto.proto); - - // Allocate storage and export the graph - std::string out(out_size, '\0'); - pb_ostream_t ostream = pb_ostream_from_buffer(reinterpret_cast(&out[0]), out_size); - pb_encode(&ostream, onnx_ModelProto_fields, &model_proto.proto); - - return std::make_tuple(out, raw_data_export_map); + return std::make_tuple(model_proto.SerializeAsString(), raw_data_export_map); } }} diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 8d20045efefe6a..22f8b40ba30542 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -345,18 +345,14 @@ std::vector emitCompilationUnit(std::ostream & out, size_t i = 0; for(auto o : subgraph.outputs()) { auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != aten::cat) { + if(o->node()->kind() != prim::FusedConcat) { emitFormal(o, desc); concat_desc.emplace_back(); flat_output_nodes.push_back(o); } else { auto cat = o->node(); - auto tensor_inputs = cat->inputs(); - // We need to drop the dim arg - tensor_inputs = tensor_inputs.slice(0, tensor_inputs.size() - 1); - size_t nInputs = tensor_inputs.size(); - concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); - for(auto c : tensor_inputs) { + concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); + for(auto c : cat->inputs()) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); } @@ -386,8 +382,9 @@ std::vector emitCompilationUnit(std::ostream & out, } for(auto n : subgraph.nodes()) { - if(n->kind() == aten::cat) - continue; // Concat nodes by narrowing the output Tensors before the kernel runs + // FusedConcat nodes work by narrowing the output Tensors before the kernel runs + if (n->kind() == prim::FusedConcat) + continue; env.s("node",valueName(n->output())); env.s("rhs", encodeRHS(n)); body << format("auto ${node} = ${rhs};\n",env); diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h index 6c4759aefb692a..c2f35ee0aa2074 100644 --- a/torch/csrc/jit/fusion_compiler.h +++ b/torch/csrc/jit/fusion_compiler.h @@ -86,7 +86,7 @@ struct CompiledFusionFunction { 
TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); - virtual ~CompiledFusionFunction() {} + virtual ~CompiledFusionFunction() = default; // expects outputs to be pre-allocated void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index df81c378ad137d..56a836b312d0c7 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -21,6 +21,7 @@ #include "torch/csrc/jit/passes/specialize_undef.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_grad_of.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" @@ -240,14 +241,7 @@ struct GraphExecutorImpl { , symbolically_differentiable(symbolically_differentiable) , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {} GraphExecutorImpl(std::shared_ptr graph, bool optimize) - : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) { - for(auto input : graph->inputs()) { - JIT_ASSERTM(input->type()->kind() != TypeKind::TupleType, "tuples cannot be inputs to the graph"); - } - for(auto output : graph->outputs()) { - JIT_ASSERTM(output->type()->kind() != TypeKind::TupleType, "tuples cannot be outputs to the graph"); - } - } + : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) {} // entry point where execution begins void run(Stack & stack) { @@ -516,28 +510,28 @@ void runRequiredPasses(const std::shared_ptr& g) { RemoveExpands(g); } -void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& spec) { +void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec) { // clean up GradOf and AutogradAdd nodes // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { defined.push_back(spec.at(i).defined()); } - specializeUndef(*graph_, defined); + specializeUndef(*graph, defined); // required passes shared with autograd fallback - runRequiredPasses(graph_); + runRequiredPasses(graph); // Decompose addmm nodes to add + mm, so expands can be inserted and // gradients accumulated on the backward pass // // In the future, if we need more passes like this, we should convert this // into a generic canonicalization pass. - DecomposeAddmm(graph_); + DecomposeAddmm(graph); // clean up dead constants from specialization - EliminateDeadCode(graph_); + EliminateDeadCode(graph); // calculate all input shapes - PropagateInputShapes(*graph_, spec); + PropagateInputShapes(*graph, spec); } void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables) { @@ -554,7 +548,7 @@ void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariab // They also may assume that concrete sizes/strides are availiable UnrollLoops(graph); - + ConstantPropagation(graph); //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. 
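For context, a rough usage sketch of the executor this pass pipeline serves; it assumes a std::shared_ptr<Graph> with a single tensor input and a single tensor output, and is illustrative rather than part of the patch:

    // Illustrative: optimization (including the passes above) is applied lazily
    // the first time the executor runs for a given argument specialization.
    at::Tensor runOnce(std::shared_ptr<Graph> graph, at::Tensor input) {
      GraphExecutor executor(graph, /*optimize=*/true);
      Stack stack;
      stack.push_back(IValue(std::move(input)));
      executor.run(stack);                  // results are left on the stack
      return stack.back().toTensor();
    }
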
PeepholeOptimize(graph); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 4e862c9e0a1e44..2693af50af1025 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -34,7 +34,7 @@ struct GraphExecutorState { struct GraphExecutorImpl; struct TORCH_API GraphExecutor { - GraphExecutor() {} + GraphExecutor() = default; GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); diff --git a/torch/csrc/jit/graph_node_list.h b/torch/csrc/jit/graph_node_list.h index 996a8b2c75fa0f..054b9517776863 100644 --- a/torch/csrc/jit/graph_node_list.h +++ b/torch/csrc/jit/graph_node_list.h @@ -1,3 +1,5 @@ +#pragma once + #include "torch/csrc/jit/assertions.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 5b128fd822dafd..a453925cf2f8eb 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -1,5 +1,5 @@ #include "torch/csrc/jit/import.h" -#include "torch/csrc/onnx/onnx.npb.h" +#include "onnx/onnx.pb.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/jit/assertions.h" @@ -16,401 +16,60 @@ namespace torch { namespace jit { namespace { -// Deserialized data - -struct Tensor_ { - std::vector dims; - std::vector raw_data; - onnx_TensorProto_DataType data_type; -}; - -struct AttributeValue_ { - std::string name; - onnx_AttributeProto_AttributeType type; - double f; - int64_t i; - std::string s; - Tensor_ t; - std::string g; - std::vector fs; - std::vector is; - std::vector ss; - std::vector ts; - std::vector gs; -}; - -struct Value_ { - std::string name; -}; - -struct Node_ { - std::string op_type; - std::string domain; - std::vector inputs; - std::vector outputs; - std::vector attrs; -}; - -struct Graph_ { - std::vector inputs; - std::vector outputs; - std::vector nodes; - std::vector initializers; -}; - -struct Model_ { - Graph_ graph; -}; - - -// Readers - -struct ReaderBase { - ReaderBase() {} - ReaderBase(pb_callback_t& cb) { - initialize_callback(cb); - } - - void initialize_callback(pb_callback_t& cb) { - cb.funcs.decode = ReaderBase::decode; - cb.arg = this; - } - - virtual void decode(pb_istream_t *stream) = 0; - - static bool decode(pb_istream_t *stream, const pb_field_t *, void **_self) { - ReaderBase* self = *reinterpret_cast(_self); - self->decode(stream); - return true; - } -}; - - -template -struct Reader : ReaderBase {}; - -template -struct Reader> : Reader { - Reader(pb_callback_t& cb) : Reader(cb) {} - // Decode is going to be called repeatedly from the callback - // (registered in the parent class constructor) each time an - // element is encountered. So all we do is relay the decoding - // through the parent class decode and push the result, every - // time this decode is called. - virtual void decode(pb_istream_t *stream) override { - Reader::decode(stream); - values.push_back(std::move(Reader::value)); - } - std::vector values; -}; - -template<> -struct Reader : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - // For string and bytes, the length value has already been - // parsed, and is available at stream->bytes_left. 
- std::vector res(stream->bytes_left); - if (!pb_read(stream, res.data(), stream->bytes_left)) { - throw std::runtime_error("Decoding failed"); - } - value.assign(res.begin(), res.end()); - } - std::string value; -}; - -template<> -struct Reader : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode_fixed32(stream, &value)) { - throw std::runtime_error("Decoding failed"); - } - } - double value; -}; - -template<> -struct Reader : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode_varint(stream, reinterpret_cast(&value))) { - throw std::runtime_error("Decoding failed"); - } - } - int64_t value; -}; - -template<> -struct Reader> : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - // For string and bytes, the length value has already been - // parsed, and is available at stream->bytes_left. - value.resize(stream->bytes_left); - if (!pb_read(stream, value.data(), stream->bytes_left)) { - throw std::runtime_error("Decoding failed"); - } - } - std::vector value; -}; - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_TensorProto_init_default) - , dims_reader(proto.dims) - , raw_data_reader(proto.raw_data) - {} - - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_TensorProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.dims = std::move(dims_reader.values); - value.raw_data = std::move(raw_data_reader.value); - value.data_type = proto.data_type; - } - - onnx_TensorProto proto; - Reader> dims_reader; - Reader> raw_data_reader; - Tensor_ value; -}; - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_AttributeProto_init_default) - , name_reader(proto.name) - , str_reader(proto.s) - , tensor_reader(proto.t) - , graph_reader(proto.g) - , floats_reader(proto.floats) - , ints_reader(proto.ints) - , strings_reader(proto.strings) - , tensors_reader(proto.tensors) - , graphs_reader(proto.graphs) {} - - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_AttributeProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.name = std::move(name_reader.value); - value.type = proto.type; - value.f = proto.f; - value.i = proto.i; - value.s = std::move(str_reader.value); - value.t = std::move(tensor_reader.value); - value.g = std::move(graph_reader.value); - value.fs = std::move(floats_reader.values); - value.is = std::move(ints_reader.values); - value.ss = std::move(strings_reader.values); - value.ts = std::move(tensors_reader.values); - value.gs = std::move(graphs_reader.values); - } - - onnx_AttributeProto proto; - Reader name_reader; - Reader str_reader; - Reader tensor_reader; - Reader graph_reader; - Reader> floats_reader; - Reader> ints_reader; - Reader> strings_reader; - Reader> tensors_reader; - Reader> graphs_reader; - AttributeValue_ value; -}; - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_ValueInfoProto_init_default) - , name_reader(proto.name) {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_ValueInfoProto_fields, &proto)) { - throw 
std::runtime_error("Decoding failed"); - } - - value.name = std::move(name_reader.value); - } - - onnx_ValueInfoProto proto; - Reader name_reader; - Value_ value; -}; - - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_NodeProto_init_default) - , op_type_reader(proto.op_type) - , domain_reader(proto.domain) - , inputs_reader(proto.input) - , outputs_reader(proto.output) - , attrs_reader(proto.attribute) - {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_NodeProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.op_type = std::move(op_type_reader.value); - value.domain = std::move(domain_reader.value); - value.inputs = std::move(inputs_reader.values); - value.outputs = std::move(outputs_reader.values); - value.attrs = std::move(attrs_reader.values); - } - - onnx_NodeProto proto; - Reader op_type_reader; - Reader domain_reader; - Reader> inputs_reader; - Reader> outputs_reader; - Reader> attrs_reader; - Node_ value; -}; - - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_GraphProto_init_default) - , input_reader(proto.input) - , output_reader(proto.output) - , node_reader(proto.node) - , initializer_reader(proto.initializer) - {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_GraphProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.inputs = std::move(input_reader.values); - value.outputs = std::move(output_reader.values); - value.nodes = std::move(node_reader.values); - value.initializers = std::move(initializer_reader.values); - } - - static Graph_ read(pb_istream_t *stream) { - Reader reader; - reader.decode(stream); - return reader.value; - } - - onnx_GraphProto proto; - Reader> input_reader; - Reader> output_reader; - Reader> node_reader; - Reader> initializer_reader; - Graph_ value; -}; - - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_ModelProto_init_default) - , graph_reader(proto.graph) {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_ModelProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.graph = std::move(graph_reader.value); - } - - static Model_ read(pb_istream_t *stream) { - Reader reader; - reader.decode(stream); - return reader.value; - } - - onnx_ModelProto proto; - Reader graph_reader; - Model_ value; -}; - - // IR graph construction -at::Tensor buildTensor(const Tensor_& tensor_) { +namespace onnx = ::ONNX_NAMESPACE; + +at::Tensor buildTensor(const onnx::TensorProto& tensor_proto) { at::Tensor tensor; - switch(tensor_.data_type) { - case onnx_TensorProto_DataType_UINT8: + switch(tensor_proto.data_type()) { + case onnx::TensorProto_DataType_UINT8: tensor = at::CPU(at::kByte).tensor(); break; - case onnx_TensorProto_DataType_INT8: + case onnx::TensorProto_DataType_INT8: tensor = at::CPU(at::kChar).tensor(); break; - case onnx_TensorProto_DataType_INT16: + case onnx::TensorProto_DataType_INT16: tensor = at::CPU(at::kShort).tensor(); break; - case onnx_TensorProto_DataType_INT32: + case onnx::TensorProto_DataType_INT32: tensor = at::CPU(at::kInt).tensor(); break; - case onnx_TensorProto_DataType_INT64: + case onnx::TensorProto_DataType_INT64: tensor = at::CPU(at::kLong).tensor(); break; - case 
onnx_TensorProto_DataType_FLOAT16: + case onnx::TensorProto_DataType_FLOAT16: tensor = at::CPU(at::kHalf).tensor(); break; - case onnx_TensorProto_DataType_FLOAT: + case onnx::TensorProto_DataType_FLOAT: tensor = at::CPU(at::kFloat).tensor(); break; - case onnx_TensorProto_DataType_DOUBLE: + case onnx::TensorProto_DataType_DOUBLE: tensor = at::CPU(at::kDouble).tensor(); break; default: throw std::runtime_error("Unsupported data type"); } - tensor.resize_(tensor_.dims); + std::vector sizes = {tensor_proto.dims().begin(), tensor_proto.dims().end()}; + tensor.resize_(sizes); JIT_ASSERT( tensor.storage()->pImpl()->get_size() * tensor.storage()->pImpl()->elementSize() == - tensor_.raw_data.size()); + tensor_proto.raw_data().size()); - std::memcpy(tensor.data_ptr(), tensor_.raw_data.data(), tensor_.raw_data.size()); + std::memcpy(tensor.data_ptr(), tensor_proto.raw_data().data(), tensor_proto.raw_data().size()); return tensor; } -Graph_ readSubgraph(const std::string& serialized_subgraph) { - pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_subgraph.data()), serialized_subgraph.size()); - - return Reader::read(&istream); -} - -void buildBlock(const Graph_& graph_, Block* block, +void buildBlock(const onnx::GraphProto& graph_proto, Block* block, std::unordered_map& value_map); -void buildBlocks(const std::vector& graphs_, Node* node, +void buildBlocks(const std::vector& graphs_, Node* node, std::unordered_map& value_map) { for (auto g_ : graphs_) { auto block = node->addBlock(); @@ -418,97 +77,96 @@ void buildBlocks(const std::vector& graphs_, Node* node, } } -std::shared_ptr buildGraph(const Graph_& graph_) { +std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) { auto graph = std::make_shared(); std::unordered_map value_map; - buildBlock(graph_, graph->block(), value_map); + buildBlock(graph_proto, graph->block(), value_map); return graph; } -void buildBlock(const Graph_& graph_, Block* block, +void buildBlock(const onnx::GraphProto& graph_proto, Block* block, std::unordered_map& value_map) { - for (auto & input : graph_.inputs) { - value_map[input.name] = block->addInput(); + for (auto & input : graph_proto.input()) { + value_map[input.name()] = block->addInput(); } - for (auto & node_ : graph_.nodes) { - JIT_ASSERT(node_.op_type != "PythonOp"); + for (auto & node_ : graph_proto.node()) { + JIT_ASSERT(node_.op_type() != "PythonOp"); - auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain, node_.op_type), - node_.outputs.size()); + auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain(), node_.op_type()), + node_.output().size()); - for (auto & attr : node_.attrs) { - Symbol name = Symbol::attr(attr.name); + for (auto & attr : node_.attribute()) { + Symbol name = Symbol::attr(attr.name()); - switch(attr.type) { - case onnx_AttributeProto_AttributeType_UNDEFINED: + switch(attr.type()) { + case onnx::AttributeProto_AttributeType_UNDEFINED: throw std::runtime_error("UNDEFINED attribute unsupported"); break; - case onnx_AttributeProto_AttributeType_FLOAT: - node->f_(name, attr.f); + case onnx::AttributeProto_AttributeType_FLOAT: + node->f_(name, attr.f()); break; - case onnx_AttributeProto_AttributeType_INT: - node->i_(name, attr.i); + case onnx::AttributeProto_AttributeType_INT: + node->i_(name, attr.i()); break; - case onnx_AttributeProto_AttributeType_STRING: - node->s_(name, std::move(attr.s)); + case onnx::AttributeProto_AttributeType_STRING: + node->s_(name, std::move(attr.s())); break; - 
case onnx_AttributeProto_AttributeType_TENSOR: - node->t_(name, buildTensor(attr.t)); + case onnx::AttributeProto_AttributeType_TENSOR: + node->t_(name, buildTensor(attr.t())); break; - case onnx_AttributeProto_AttributeType_GRAPH: - node->g_(name, buildGraph(readSubgraph(attr.g))); + case onnx::AttributeProto_AttributeType_GRAPH: + node->g_(name, buildGraph(attr.g())); break; - case onnx_AttributeProto_AttributeType_FLOATS: - node->fs_(name, std::move(attr.fs)); + case onnx::AttributeProto_AttributeType_FLOATS: + node->fs_(name, {attr.floats().begin(), attr.floats().end()}); break; - case onnx_AttributeProto_AttributeType_INTS: - node->is_(name, std::move(attr.is)); + case onnx::AttributeProto_AttributeType_INTS: + node->is_(name, {attr.ints().begin(), attr.ints().end()}); break; - case onnx_AttributeProto_AttributeType_STRINGS: - node->ss_(name, std::move(attr.ss)); + case onnx::AttributeProto_AttributeType_STRINGS: + node->ss_(name, {attr.strings().begin(), attr.strings().end()}); break; - case onnx_AttributeProto_AttributeType_TENSORS: - node->ts_(name, fmap(attr.ts, [](const Tensor_& t) { return buildTensor(t); })); + case onnx::AttributeProto_AttributeType_TENSORS: + node->ts_(name, fmap(attr.tensors(), [](const onnx::TensorProto& t) { return buildTensor(t); })); break; - case onnx_AttributeProto_AttributeType_GRAPHS: - if (attr.name == "_blocks") { - buildBlocks(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); }), node, value_map); + case onnx::AttributeProto_AttributeType_GRAPHS: + if (attr.name() == "_blocks") { + buildBlocks({attr.graphs().begin(), attr.graphs().end()}, node, value_map); } else { - node->gs_(name, fmap(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); } ), - [](const Graph_& g_) { return buildGraph(g_); })); + node->gs_(name, fmap(attr.graphs(), [](const onnx::GraphProto& g_) { return buildGraph(g_); })); } break; } } - for (auto & input : node_.inputs) { + for (auto & input : node_.input()) { auto v = value_map[input]; node->addInput(v); } - for (size_t i=0; ioutputs()[i]; + for (int i=0; ioutputs()[i]; } block->appendNode(node); } - for (auto & output : graph_.outputs) { - Value* v = value_map.at(output.name); + for (auto & output : graph_proto.output()) { + Value* v = value_map.at(output.name()); block->registerOutput(v); } } -std::shared_ptr buildGraph(const Graph_& graph_, std::vector& initializers) { +std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto, std::vector& initializers) { - auto graph = buildGraph(graph_); + auto graph = buildGraph(graph_proto); - for (auto tensor_ : graph_.initializers) { + for (auto tensor_ : graph_proto.initializer()) { initializers.push_back(buildTensor(tensor_)); } @@ -557,12 +215,10 @@ void reconstructOutputTypes(Block *b) { std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector& initializers) { + auto model_proto = ::ONNX_NAMESPACE::ModelProto(); + model_proto.ParseFromString(serialized_graph); - pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_graph.data()), serialized_graph.size()); - - auto model = Reader::read(&istream); - - auto graph = buildGraph(model.graph, initializers); + auto graph = buildGraph(model_proto.graph(), initializers); reconstructOutputTypes(graph->block()); diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index d3a9bd9139a96e..5363eda02ff528 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -18,6 +18,7 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include 
"torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/to_batch.h" #include "torch/csrc/jit/passes/specialize_undef.h" @@ -70,11 +71,14 @@ void initJITBindings(PyObject *module) { }) .def("_jit_pass_lint", LintGraph) .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) { - PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs))); + PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs, graph.inputs()))); }) .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) + .def("_jit_pass_constant_propagation", [](std::shared_ptr& g) { + return ConstantPropagation(g); + }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will @@ -182,15 +186,16 @@ void initJITBindings(PyObject *module) { return ge.graph(); }) .def("graph_for", [](GraphExecutor& ge, py::args args) { - return ge.graphFor(createStack(args)); + return ge.graphFor(createStack(args, ge.graph()->inputs())); }) .def("get_debug_state", [](GraphExecutor& ge) { return ge.getDebugState(); }) .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { - auto stack = createStack(args); + const auto & graph = ge.graph(); + auto stack = createStack(args, graph->inputs()); ge.run(stack); - return wrapStack(std::move(stack)); + return wrapStack(std::move(stack), graph->outputs()); }); diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 52b8cb0eaccd98..c567793552d73a 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -50,6 +50,7 @@ _(prim, TensorToNum) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ +_(prim, FusedConcat) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 65bdcf695f6de2..0c1fe17ade0dfd 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -337,9 +337,9 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined)), nullptr) {} + : TensorImpl(at::Backend::Undefined,at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} - virtual ~ContainerTensor() {} + virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { throw std::runtime_error("sizes() on ContainerTensor"); } @@ -685,8 +685,8 @@ struct CodeImpl { // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { - InterpreterStateImpl(const Code & function_) - : function(function_.pImpl), + InterpreterStateImpl(const Code & code) + : function(code.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { @@ -775,15 +775,15 @@ std::ostream & operator<<(std::ostream & out, const Code & code) { Code::Code(std::shared_ptr& graph) : pImpl(new CodeImpl(graph)) {} -Code::~Code() {} +Code::~Code() = default; const std::vector& Code::executors() { return pImpl->executors(); } -InterpreterState::InterpreterState(const Code & 
function) - : pImpl(new InterpreterStateImpl(function)) {} -InterpreterState::~InterpreterState() {} +InterpreterState::InterpreterState(const Code & code) + : pImpl(new InterpreterStateImpl(code)) {} +InterpreterState::~InterpreterState() = default; void InterpreterState::runOneStage(Stack & stack) { return pImpl->runOneStage(stack); diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 7f09b22b324d11..ede14249c46dce 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -44,9 +44,9 @@ std::ostream& operator<<(std::ostream & out, const at::ArrayRef & nodes) { } struct const_value_list_with_types { - const std::vector& values; + const ArrayRef values; bool use_newlines; - const_value_list_with_types(const std::vector& values, bool use_newlines = false) + const_value_list_with_types(ArrayRef values, bool use_newlines = false) : values(values), use_newlines(use_newlines) {} }; std::ostream& operator<<(std::ostream & out, const_value_list_with_types l) { @@ -355,7 +355,7 @@ void Graph::lint() const { // - every use will occur later in the topsort struct LintScope { - LintScope() {} + LintScope() = default; LintScope(std::unique_ptr parent) : parent(std::move(parent)) {} bool contains(const Value * v) { @@ -487,13 +487,13 @@ void LintGraph(std::shared_ptr& graph) { graph->lint(); } -void Block::cloneFrom(Block * src, std::function outer_map) { +void Block::cloneFrom(Block * src, std::function value_map) { std::unordered_map local_map; auto env = [&](Value * v) { auto it = local_map.find(v); if(it != local_map.end()) return it->second; - return outer_map(v); + return value_map(v); }; auto graph = owningGraph(); @@ -619,23 +619,8 @@ Value* Node::namedInput(Symbol name) const { // so this is completely unsafe and needs to be gone as soon as possible. 
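The namedInput simplification in this hunk (continued just below) works because Tensor[] arguments are no longer flattened into a node's input list; the list arrives as a single Value produced by a prim::ListConstruct node. An illustrative helper showing how a consumer can now recover the element tensors:

    // Hypothetical helper: collect the tensors feeding an aten::cat node.
    std::vector<Value*> catTensorInputs(Node* cat) {
      JIT_ASSERT(cat->kind() == aten::cat);
      Value* tensors = cat->namedInput(attr::tensors);  // the single Tensor[] value
      Node* list = tensors->node();
      JIT_ASSERT(list->kind() == prim::ListConstruct);
      return list->inputs().vec();                      // the individual tensor Values
    }
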
return v; } - const auto & the_schema = schema(); - int64_t tensor_list_pos = 0; - for (auto & arg : the_schema.arguments) { - if (*arg.type == *ListType::ofTensors()) - break; - tensor_list_pos++; - } int64_t arg_pos = findArgument(schema(), name).first; - // XXX: we don't have a single value we could give for a Tensor[], - // because we flatten lists into arguments - JIT_ASSERT(arg_pos != tensor_list_pos); - // NB: if there's no tensor list, then tensor_list_pos == arguments.size(), so this is always true - if (arg_pos < tensor_list_pos) { - return input(arg_pos); - } else { - return input(inputs().size() - (the_schema.arguments.size() - arg_pos)); - } + return input(arg_pos); } bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { @@ -646,8 +631,12 @@ bool Node::matches(const char *signature_literal, at::ArrayRef const_inp return true; } +void Node::dump() const { + std::cout << *this << "\n"; +} + void Node::findSchema() const { - schema_ = &getOperatorFor(this).schema; + schema_ = &getOperatorFor(this).schema(); } PythonOp* defaultAllocPythonOp(Graph*g) { diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 9af468e6ee06e7..b2caa642b6fe20 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -54,7 +54,7 @@ struct Value; TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); -TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & n); // A list of nodes, with inputs and outputs struct Block; @@ -683,7 +683,9 @@ struct Node : public Attributes { return *schema_; } - virtual ~Node() {} + void dump() const; + + virtual ~Node() = default; private: std::pair findInput(Symbol name); void findSchema() const; @@ -889,8 +891,7 @@ friend struct Block; , block_(new Block(this, nullptr)) , insert_before_(return_node()) {} - Graph() - : Graph( std::make_shared()) {} + Graph() : Graph(std::make_shared()) {} at::ArrayRef inputs() { return block_->inputs(); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 42a5be89e55e4b..6eef40a0323068 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -83,6 +83,7 @@ struct ConstantList; struct IValue; using Tuple = ConstantList; using IntList = ConstantList; +using TensorList = ConstantList; using DoubleList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold @@ -93,7 +94,7 @@ using DoubleList = ConstantList; // retain/release calls. 
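The TensorList variant added to IValue in the hunk below mirrors the existing IntList/DoubleList handling. A rough sketch of how it might be used, assuming the constructors and accessors introduced there (the helper names are hypothetical, and the list is assumed non-empty):

    IValue packTensors(std::vector<at::Tensor> tensors) {
      return IValue(std::move(tensors));                // tagged as TensorList
    }

    at::Tensor firstTensor(const IValue& value) {
      JIT_ASSERT(value.isTensorList());
      return value.toTensorList()->elements().front();  // Shared<TensorList> access
    }
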
#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(TensorList) struct IValue { IValue() @@ -223,6 +224,20 @@ struct IValue { return toRetainable(); } + //TensorList + IValue(Shared v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + Shared toTensorList() && { + JIT_ASSERT(isTensorList()); + return moveToRetainable(); + } + Shared toTensorList() const & { + JIT_ASSERT(isTensorList()); + return toRetainable(); + } + + // None bool isNone() { return Tag::None == tag; } @@ -369,8 +384,15 @@ inline IValue::IValue(Shared v) inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} +inline IValue::IValue(Shared v) +: tag(Tag::TensorList), retainable(true) { + as_retainable = v.detach(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + inline std::vector IValue::copyToIntList() const { - return std::vector(toIntList()->elements()); + return toIntList()->elements().vec(); } }} diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index f19d18caa9289e..5cb2c2c11ad5a7 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -248,8 +248,12 @@ std::string canonicalSchemaString(const FunctionSchema& schema) { using OperatorMap = std::unordered_map>>; struct OperatorRegistry { - OperatorMap operators; +private: std::mutex lock; + OperatorMap operators; + // list of operators whose schema have not yet been parsed, and must + // be registered before any call to lookup an opeator + std::vector> to_register; // Those two maps are used to implement lookupByLiteral, which is needed for the n->match(...) calls. // Basically, every function schema is assigned a unique string you can use to match it. However, // parsing those strings or comparing and hashing them character by character would be very slow, so @@ -260,18 +264,26 @@ struct OperatorRegistry { // by performing a lookup in the operators_by_sig map. 
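The registry rework in this hunk defers schema parsing: registerOperator only queues the Operator, and registerPendingOperators indexes the queue under the lock before any lookup, so string schemas are parsed at most once and never during static initialization. The same pattern in isolation (the type and member names are illustrative, not the real registry):

    struct LazySignatureIndex {
      std::mutex lock;
      std::vector<std::shared_ptr<Operator>> pending;
      std::unordered_map<std::string, std::shared_ptr<Operator>> by_signature;

      void add(Operator&& op) {
        std::lock_guard<std::mutex> guard(lock);
        pending.push_back(std::make_shared<Operator>(std::move(op)));  // no parsing yet
      }
      std::shared_ptr<Operator> find(const std::string& signature) {
        std::lock_guard<std::mutex> guard(lock);
        for (auto& op : pending)                // schema() parses lazily here
          by_signature[canonicalSchemaString(op->schema())] = op;
        pending.clear();
        auto it = by_signature.find(signature);
        return it == by_signature.end() ? nullptr : it->second;
      }
    };
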
std::unordered_map> operators_by_sig; std::unordered_map> operators_by_sig_literal; - void registerOperator(Operator&& op){ - std::lock_guard guard(lock); - Symbol sym = Symbol::fromQualString(op.schema.name); - auto op_ptr = std::make_shared(std::move(op)); - - operators[sym].push_back(op_ptr); + // XXX - caller must be holding lock + void registerPendingOperators() { + for(auto op : to_register) { + Symbol sym = Symbol::fromQualString(op->schema().name); + operators[sym].push_back(op); + operators_by_sig[canonicalSchemaString(op->schema())] = op; + } + to_register.clear(); + } - operators_by_sig[canonicalSchemaString(op.schema)] = op_ptr; +public: + void registerOperator(Operator&& op) { + std::lock_guard guard(lock); + to_register.push_back(std::make_shared(std::move(op))); } const std::shared_ptr& lookupByLiteral(const char * name) { + std::lock_guard guard(lock); + registerPendingOperators(); auto it = operators_by_sig_literal.find(name); if (it == operators_by_sig_literal.end()) { auto op_ptr_it = operators_by_sig.find(name); @@ -289,8 +301,10 @@ struct OperatorRegistry { return it->second; } + const std::vector>& getOperators(Symbol name) { std::lock_guard guard(lock); + registerPendingOperators(); static std::vector> empty; auto it = operators.find(name); if(it != operators.end()) @@ -342,16 +356,16 @@ bool typeMatches(TypePtr actual, TypePtr formal) { } bool Operator::matches(const Node* node) const { - if (node->kind().toQualString() != schema.name) { + if (node->kind().toQualString() != schema().name) { return false; } size_t attributes_size = node->numAttributes(); size_t attributes_seen = 0; auto inputs_size = node->inputs().size(); size_t input_i = 0; - for(size_t arg_i = 0; arg_i < schema.arguments.size(); ++arg_i) { + for(size_t arg_i = 0; arg_i < schema().arguments.size(); ++arg_i) { at::optional attribute_kind; - const Argument& arg = schema.arguments[arg_i]; + const Argument& arg = schema().arguments[arg_i]; if(attributes_size > 0 && (attribute_kind = attributeKindOf(arg.type))) { auto name = Symbol::fromQualString("attr::" + arg.name); if(!node->hasAttribute(name) || node->kindOf(name) != *attribute_kind) { @@ -359,22 +373,6 @@ bool Operator::matches(const Node* node) const { return false; } attributes_seen++; - } else if(*arg.type == *ListType::ofTensors()) { - // Tensor[] is handled as varargs, consume inputs until the remaining required arguments - // XXX - there can only be a single Tensor[] in a declaration - size_t remaining_required = 0; - for(size_t j = arg_i + 1; j < schema.arguments.size(); ++j){ - // remaining arguments are only those that won't be consumed from attributes - if(attributes_size == 0 || !attributeKindOf(schema.arguments[j].type)) - remaining_required++; - } - while(inputs_size - input_i > remaining_required) { - auto input = node->inputs()[input_i++]; - if(!typeMatches(input->type(), DynamicType::get())) { - // std::cout << "vararg argument is not Dynamic\n"; - return false; - } - } } else { if(input_i == inputs_size) { // std::cout << "not enough inputs\n"; @@ -388,11 +386,11 @@ bool Operator::matches(const Node* node) const { } } - if(!schema.is_vararg && input_i != inputs_size) { + if(!schema().is_vararg && input_i != inputs_size) { // std::cout << "not all inputs used\n" << input_i << " " << inputs_size << "\n"; return false; } - if(!schema.is_vararg && attributes_seen != attributes_size) { + if(!schema().is_vararg && attributes_seen != attributes_size) { // std::cout << "not all attributes used\n" << attributes_seen << " " << 
attributes_size << "\n"; return false; } @@ -426,7 +424,7 @@ const Operator& getOperatorFor(const Node* node) { er << "\ncandidates were:\n"; const auto& candidates = getAllOperatorsFor(node->kind()); for(auto & candidate : candidates) { - er << " " << candidate->schema << "\n"; + er << " " << candidate->schema() << "\n"; } throw er; } @@ -436,7 +434,7 @@ OperatorSet::OperatorSet(std::initializer_list sig_literals) { auto & registry = getRegistry(); for (const char * sig : sig_literals) { auto op = registry.lookupByLiteral(sig); - ops[Symbol::fromQualString(op->schema.name)].push_back(op); + ops[Symbol::fromQualString(op->schema().name)].push_back(op); } } diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 7e6a314d2cb8c3..be2c20b01a5379 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -2,57 +2,81 @@ // once C10 exists this can be removed, or stubbed out, but we need // it now to implement correct semantic checking for script #pragma once -#include "ATen/ATen.h" + #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/stack.h" +#include "ATen/ATen.h" + +#include +#include +#include +#include +#include +#include +#include + namespace torch { namespace jit { -FunctionSchema parseSchema(const std::string& decl); +FunctionSchema parseSchema(const std::string& schema); using OperationCreator = std::function; struct TORCH_API Operator { - Operator(FunctionSchema schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) - : schema(std::move(schema)) - , op(std::move(op)) - , op_const_attributes(std::move(op_const_attributes)) {} + Operator(FunctionSchema schema, OperationCreator op_creator) + : schema_(std::make_shared(std::move(schema))), + op_creator_(std::move(op_creator)) {} - Operator(const std::string& schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) - : Operator(parseSchema(schema), std::move(op), std::move(op_const_attributes)) {} + Operator(const std::string& schema, OperationCreator op_creator) + : schema_string_(schema), op_creator_(std::move(op_creator)) {} - // Helper constructor to regsiter `op` to run + // Helper constructor to register `op` to run // run for _every_ IR Node where n.kind() == name, regardless of arguments. - // This is accomplished by marking the schema varargs and having no required arguments. - // This is used for things like prim::While or prim::If that can take a number - // of different valid input types and lengths. - Operator(Symbol name, OperationCreator op) - : Operator(FunctionSchema(name, {}, {}, true), op, op) {} - - FunctionSchema schema; - - bool matches(const Node* n) const; - // Operators have different versions depending on if some inputs are encoded - // as attributes or inputs. This function returns the right Operation function, - // given a node encoded for one variant. - // Behavior is undefined if matches(n) == false - // TODO (apaszke) : remove - Operation selectVariant(Node* n) const { - if(n->hasAttributes()) { - JIT_ASSERT(op_const_attributes != nullptr); - return op_const_attributes(n); - } else { - return op(n); + // This is accomplished by marking the schema varargs and having no required + // arguments. This is used for things like prim::While or prim::If that can + // take a number of different valid input types and lengths. 
+ Operator(Symbol name, OperationCreator op_creator) + : Operator(FunctionSchema(name, {}, {}, true), std::move(op_creator)) {} + + Operator(FunctionSchema schema, Operation op) + : schema_(std::make_shared(std::move(schema))), + op_(std::make_shared(std::move(op))) {} + + Operator(const std::string& schema, Operation op) + : schema_string_(schema), + op_(std::make_shared(std::move(op))) {} + + bool matches(const Node* node) const; + + Operation getOperation(Node* node = nullptr) const { + if (op_) { + return *op_; } + AT_ASSERT(node != nullptr); + return op_creator_(node); } - bool hasAttributedVersion() const { - return op_const_attributes != nullptr; + + const FunctionSchema & schema() const { + // we lazily parse schema initialized from strings so that + // we do less work during static operator registration + if(!schema_) { + schema_ = std::make_shared(parseSchema(schema_string_.value())); + schema_string_ = at::nullopt; + } + return *schema_; } private: - OperationCreator op; - OperationCreator op_const_attributes; + mutable at::optional schema_string_; + // cannot use at::optional because windows has issues that require an assignment operator to be generated + // cannot use std::unique_ptr because initializer lists of Operators end up copying the Operator + mutable std::shared_ptr schema_; + + // Essentially a variant. + // NB: std::function has a default state (where it == nullptr). + std::shared_ptr op_; + OperationCreator op_creator_; }; const std::vector>& getAllOperatorsFor(Symbol name); @@ -62,7 +86,7 @@ const Operator& getOperatorFor(const Node* node); inline Operation getOperation(Node* node) { // note: getOperatorFor ensures that getOperatorFor(node).matches(node) == true // so the call to selectVariant is always valid. - return getOperatorFor(node).selectVariant(node); + return getOperatorFor(node).getOperation(node); } void registerOperator(Operator&& op); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 0e40bc8831a6df..414dc1652a4da1 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -3,8 +3,9 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/constants.h" -#include "torch/csrc/utils/functional.h" +#include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/assertions.h" +#include "torch/csrc/utils/functional.h" #include #include @@ -191,12 +192,11 @@ void BatchMMBlock(Block* block) { int cat_dim = s == Side::LHS ? 
1 : 0; cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes - auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); WithInsertPoint iguard { root.node }; - inputs.push_back(insertConstant(*graph, cat_dim)); - Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); - cat->output()->setType(type->withSizes(cat_sizes)); - return cat->output(); + auto inputs = fmap(matmuls, [=](Node *mm) -> SymbolicVariable { return mm->inputs()[inputs_off]; }); + auto cat_output = SymbolicVariable::cat(inputs, cat_dim).value(); + cat_output->setType(type->withSizes(cat_sizes)); + return cat_output; }; auto lhs_batch = batch_inputs(Side::LHS, root.lhs_sizes); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp new file mode 100644 index 00000000000000..39492f9e76c50c --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -0,0 +1,95 @@ +#include "torch/csrc/jit/passes/constant_propagation.h" +#include "torch/csrc/autograd/variable.h" +#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ivalue.h" +#include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" +#include "torch/csrc/utils/functional.h" + +namespace torch { namespace jit { + +namespace { + +std::unordered_set skip_list = { + //FIXME If & Loop require special casing because they cannot be run as a + //single node. + prim::If, + prim::Loop, + //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be + //FIXME treated as having side effects but ONNX depends on them being removed + prim::Print, + //all the rand functions from native_functions.yaml + aten::permute, + aten::rand, + aten::rand_out, + aten::rand_like, + aten::randint, + aten::randint_out, + aten::randint_like, + aten::randn, + aten::randn_out, + aten::randn_like, + aten::randperm, + aten::randperm_out, + }; + +std::vector runNode(Node* n) { + auto op = getOperation(n); + Stack stack; + for (auto input : n->inputs()) { + stack.push_back(*(toIValue(input))); + } + op(stack); + auto var_outputs = fmap(stack, [&](IValue v) { + if (v.isTensor()) { + return IValue(autograd::as_variable_ref(v.toTensor()).data()); + } else { + return v; + } + }); + return var_outputs; +} + +void propagateNode(Node* n) { + auto outputs = runNode(n); + auto graph = n->owningGraph(); + WithInsertPoint guard(n); + for (size_t i = 0; i < outputs.size(); ++i) { + auto new_output = insertConstant(*graph, outputs[i]); + n->outputs()[i]->replaceAllUsesWith(new_output); + // let dce elimination remove n + } +} + +} // anonymous namespace + +void ConstantPropagation(Node* n, bool recurse) { + bool constant_inputs = (n->inputs().size() > 0) && + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = skip_list.count(n->kind()) == 0; + if (constant_inputs && supported_node) { + propagateNode(n); + } + if (recurse) { + for (Block * block : n->blocks()) + ConstantPropagation(block, recurse); + } +} + +void ConstantPropagation(Block* block, bool recurse) { + ConstantPropagation(block->param_node(), recurse); + for (auto n: block->nodes()) { + ConstantPropagation(n, recurse); + } +} + +void ConstantPropagation(std::shared_ptr& graph) { + ConstantPropagation(graph->block(), true); + EliminateDeadCode(graph); +} + +}} diff --git a/torch/csrc/jit/passes/constant_propagation.h 
b/torch/csrc/jit/passes/constant_propagation.h new file mode 100644 index 00000000000000..12df329c81ccfc --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +TORCH_API void ConstantPropagation(std::shared_ptr& graph); +TORCH_API void ConstantPropagation(Block* block, bool recurse); +TORCH_API void ConstantPropagation(Node* n, bool recurse); + +}} diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index cb3757cffb0e34..cc8dcb8926dee0 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -177,16 +177,25 @@ struct GraphFuser { } } - bool allCatInputsHaveSameSize(Node * node) { - JIT_ASSERT(node->kind() == aten::cat); - std::vector inputs = node->inputs(); - if (!node->hasAttributes()) { - inputs.pop_back(); // Get rid of the dim argument - } + bool isFusableCatNode(Node * node) { + if (node->kind() != aten::cat) + return false; + if (!node->is_constant(attr::dim)) + return false; - auto expected = inputs.at(0)->type()->cast(); + auto tensors_node = node->namedInput(attr::tensors)->node(); + if (tensors_node->kind() != prim::ListConstruct) return false; + // NB: Note that technically other uses of the list aren't a big problem for us. + // It would be enough to place the prim::FusedConcat before the prim::ListConstruct, and + // allUsersAreThisConsumerOrOccurAfterIt would still be satisfied. However, I don't expect this + // to be necessary any time soon, and so we're simply assuming that we don't have to deal with that. + if (tensors_node->output()->uses().size() > 1) return false; + auto tensors = tensors_node->inputs(); + + // Our fusion code assumes that all inputs have the same shapes, so we need to check this too. + auto expected = tensors.at(0)->type()->cast(); if (!expected) return false; - return std::all_of(inputs.begin(), inputs.end(), [expected](Value *v) { + return std::all_of(tensors.begin(), tensors.end(), [&expected](Value *v) { auto actual = v->type()->cast(); return actual && actual->sizes() == expected->sizes(); }); @@ -197,15 +206,7 @@ struct GraphFuser { // because it is not a simple map, can be put in a fusion group // as long as no items in the group read the output of concat bool isFusableAsExitNode(Node * node) { - if(isFusable(node)) - return true; - // this concat fusion only works when all the inputs are the same size - // and we can statically infer the dimension along which we should concat - // otherwise they cannot partipate in the same map - if(node->kind() == aten::cat && node->is_constant(attr::dim) && allCatInputsHaveSameSize(node)) - return true; - - return false; + return isFusable(node) || isFusableCatNode(node); } // necessary condition for fusion. If all of the uses of producer are consumer @@ -241,8 +242,9 @@ struct GraphFuser { // we can move the consumer up into the producer. // but this requires better handling of merging fusion groups so it is not done now at::optional consumer_device = getDevice(consumer); + Node *real_consumer = consumer->kind() == aten::cat ? 
consumer->namedInput(attr::tensors)->node() : consumer; return isFusable(producer->node()) && - allUsersAreThisConsumerOrOccurAfterIt(consumer, producer) && + allUsersAreThisConsumerOrOccurAfterIt(real_consumer, producer) && consumer_device && consumer_device == getDevice(producer->node()) && (*consumer_device != kCPUDevice || sharedFusionCompiler().canCompileOnCPU()); } @@ -389,7 +391,24 @@ struct GraphFuser { Node * fuse(Node * consumer, Value * producer) { auto group = consumer; - if(group->kind() != prim::FusionGroup) { + if (consumer->kind() == aten::cat) { + Graph * graph = consumer->owningGraph(); + Node * list_construct = consumer->namedInput(attr::tensors)->node(); + int64_t dim = consumer->get(attr::dim).value(); + + Node * fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())->i_(attr::dim, dim); + fused_cat->insertBefore(list_construct); + fused_cat->output()->copyMetadata(consumer->output()); + consumer->output()->replaceAllUsesWith(fused_cat->output()); + topological_index[fused_cat] = topological_index[list_construct]; + + // NB: this deletes the fused_cat node from the original graph + group = createSingletonFusionGroup(fused_cat); + consumer->destroy(); + if (list_construct->output()->uses().empty()) { + list_construct->destroy(); + } + } else if (consumer->kind() != prim::FusionGroup) { group = createSingletonFusionGroup(consumer); } if (producer->node()->kind() == prim::FusionGroup) { @@ -450,7 +469,6 @@ struct GraphFuser { } } - // TODO: Remove this restriction if we ever need to distribute across // multiple return operators Node * producer_for_chunk_node = producer_for_chunk->node(); JIT_ASSERT(producer_for_chunk_node->outputs().size() == 1); @@ -521,11 +539,14 @@ struct GraphFuser { std::pair scanNode(Node * consumer) { auto stage_guard = block->owningGraph()->setStageTemporary(consumer->stage()); if(isFusableAsExitNode(consumer)) { + value_list inputs; + auto consumer_inputs = consumer->kind() == aten::cat ? + consumer->namedInput(attr::tensors)->node()->inputs() : + consumer->inputs(); // handle inputs in reverse topological order as well... // otherwise in f(a,a+b) it will appear a is used twice if we consider // the f-a fusion before the f-(a+b) fusion first. 
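For illustration, here is a minimal standalone sketch of the look-through-ListConstruct pattern that isFusableCatNode, canFuse, and the consumer_inputs selection above all rely on: when the consumer is aten::cat, its tensor inputs come wrapped in a prim::ListConstruct node, so the fuser inspects that node's inputs instead of the consumer's own. ToyNode and realInputs are hypothetical stand-ins, not the real JIT IR classes.

#include <cassert>
#include <string>
#include <vector>

// Toy stand-ins for the JIT IR; names are illustrative only.
struct ToyNode {
  std::string kind;
  std::vector<ToyNode*> inputs;   // producer nodes of each input value
};

// For most consumers the fusable producers are the direct inputs, but for
// aten::cat the tensors are wrapped in a prim::ListConstruct node, so we
// look through that node to reach the real producers.
std::vector<ToyNode*> realInputs(ToyNode* consumer) {
  if (consumer->kind == "aten::cat") {
    ToyNode* list = consumer->inputs.at(0);      // the tensor-list argument
    assert(list->kind == "prim::ListConstruct");
    return list->inputs;
  }
  return consumer->inputs;
}

int main() {
  ToyNode a{"aten::mul", {}}, b{"aten::add", {}};
  ToyNode list{"prim::ListConstruct", {&a, &b}};
  ToyNode cat{"aten::cat", {&list}};
  assert(realInputs(&cat).size() == 2);   // sees a and b, not the list node
  assert(realInputs(&a).empty());
}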
- value_list inputs; - for(auto i : consumer->inputs()) { + for(auto i : consumer_inputs) { if (i->node()->owningBlock() == block) { inputs.push_back(i); JIT_ASSERT(topological_index.count(i->node()) > 0); diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h index a0a881e3002ed9..0ec3589e3acd31 100644 --- a/torch/csrc/jit/passes/lower_grad_of.h +++ b/torch/csrc/jit/passes/lower_grad_of.h @@ -10,6 +10,6 @@ namespace torch { namespace jit { // outputs = // else: // outputs = undefineds -TORCH_API void LowerGradOf(Graph& graph); +TORCH_API void LowerGradOf(Graph& g); }} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 63fb7030aa3ad1..ee9b76f417bd17 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -263,6 +263,39 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { default: break; // fall-through } + if (node->matches("aten::cat(Tensor[] tensors, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto list_node = node->namedInput(attr::tensors)->node(); + JIT_ASSERT(list_node->kind() == prim::ListConstruct); + auto tensors = list_node->inputs(); + if (tensors.size() > 0) { + auto input_types = fmap(tensors, [](Value *v) { return v->type()->cast(); }); + if (std::all_of(input_types.begin(), input_types.end(), + [](const TensorTypePtr& tp) { return tp != nullptr; })) { + std::vector sizes = input_types[0]->sizes(); + const int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + const int64_t ndim = sizes.size(); + + if (dim < 0 || dim >= ndim) + goto cat_fail; + + sizes[dim] = 0; + for (auto & tp : input_types) { + auto & tp_sizes = tp->sizes(); + if (sizes.size() != tp_sizes.size()) + goto cat_fail; + for (int64_t i = 0; i < ndim; ++i) { + if (sizes[i] != tp_sizes[i] && i != dim) { + goto cat_fail; + } + } + sizes[dim] += tp_sizes[dim]; + } + node->output()->setType(input_types[0]->withSizes(sizes)); + return; + } + } + } +cat_fail: bool can_propagate_by_running = canPropagateShapeByRunningIt(node); auto maybe_tensor_types = gatherTensorTypes(node); diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index 5494cf2b78a798..f78da9b92baccc 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -3,59 +3,530 @@ namespace torch { namespace jit { -std::unordered_map> ToBatch::batch_operator_table; +std::unordered_map>> ToBatch::batch_operator_table; -void ToBatch::toBatch(Block* block, Block* res_block) { - // change inputs of a graph - expand tensor to {data, mask, dims} - auto size = block->inputs().size(); - for(size_t i = 0; i < size; i++){ - auto input = block->inputs()[i]; +std::shared_ptr ToBatch::getBatchOperator(std::string name, int64_t num_inputs){ + if(batch_operator_table.find(name) == batch_operator_table.end()){ + throw std::runtime_error("function " + name + " is not supported in batched tensor yet"); + } + auto ops = batch_operator_table.at(name); + if(num_inputs == -1) // default function + return ops[0]; + for(auto op : ops){ + if(size_t(num_inputs) == op->inputs().size()) + return op; + } + throw std::runtime_error("function " + name + " with " + std::to_string(num_inputs) + " inputs is not supported in batched tensor yet"); +} + +// replace aten operator node with BatchTensor operator graph +void ToBatch::visitAten(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + auto func_name = 
std::string(n->kind().toUnqualString()); + std::vector new_inputs; + for(Value *input : n->inputs()){ + if(rn_env.find(input) == rn_env.end()){ // non-tensor input + auto new_input = batch_map.at(input); + new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); + } + else{ // batched tensor input + new_inputs.push_back(rn_env.at(input)); + } + } + + // transform scalar to tensor before pass to batch operator script + for(size_t i = 0; i < new_inputs.size(); i++){ + auto input = new_inputs[i]; + if(input->type() == IntType::get() || input->type() == FloatType::get()){ + auto to_tensor_node = res_graph->createNumToTensor(input); + res_graph->insertNode(to_tensor_node); + new_inputs[i] = to_tensor_node->output(); + } + } + + auto batch_graph = getBatchOperator(func_name, new_inputs.size()); + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); + + // Assume all outputs from inlined operator implementation are in the triple form batched tensor or just a single non-tensor. + if(outputs.size() == 1){ + // if previous output is scalar, transform new output back to scalar from dynamic + if(n->outputs()[0]->type() != outputs[0]->type()){ + Node* to_scalar_node; + if(n->outputs()[0]->type() == IntType::get()){ + to_scalar_node = res_graph->createTensorToNum(IntType::get(), outputs[0]); + } + else if(n->outputs()[0]->type() == FloatType::get()){ + to_scalar_node = res_graph->createTensorToNum(FloatType::get(), outputs[0]); + } + else{ + throw std::runtime_error("NYI: scalar type other than int, float is not supported yet"); + } + res_graph->insertNode(to_scalar_node); + rn_env[n->outputs()[0]] = to_scalar_node->output(); + } + else + rn_env[n->outputs()[0]] = outputs[0]; + } + else{ + for(size_t i = 0; i < n->outputs().size(); i++){ + auto output = n->outputs()[i]; + batch_map[output] = std::vector(outputs.begin() + i * EXP_BTENSOR_SIZE, outputs.begin() + i * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE); + } + } +} + +// clone prim::Constant to new graph +// batching transformation is applied to the output of prim::NumToTensor. +// If there is a prim::NumToTensor following prim::Constant, it will be finally transformed to BatchTensor. 
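The visitors below expand every tensor value into the triple {data, mask, dims}. As a rough mental model, an elementwise operation applies to data, ANDs the masks, and ORs the dims flags, which is the pattern the transformed graphs further down spell out with aten::mul on masks and aten::__or__ on dims. The sketch below is an illustrative assumption about that semantics, not the actual torch.jit BatchTensor implementation.

#include <cassert>
#include <vector>

// Toy model of an expanded batched tensor: per-element data, a validity
// mask, and per-dimension "does this dim vary across examples" flags.
struct ToyBatch {
  std::vector<double> data;
  std::vector<bool> mask;   // 1 where data is valid (padding is masked out)
  std::vector<bool> dims;   // 1 where a dimension is dynamic
};

// Elementwise op on the expanded form: apply the op to data, AND the masks,
// OR the dims -- mirroring the aten::add / aten::mul / aten::__or__ pattern
// in the transformed graphs shown below.
ToyBatch add(const ToyBatch& a, const ToyBatch& b) {
  ToyBatch out;
  for (size_t i = 0; i < a.data.size(); ++i) {
    out.data.push_back(a.data[i] + b.data[i]);
    out.mask.push_back(a.mask[i] && b.mask[i]);
  }
  for (size_t d = 0; d < a.dims.size(); ++d)
    out.dims.push_back(a.dims[d] || b.dims[d]);
  return out;
}

int main() {
  ToyBatch a{{1, 2, 0}, {true, true, false}, {true}};   // last slot is padding
  ToyBatch b{{4, 5, 6}, {true, true, true}, {false}};
  ToyBatch c = add(a, b);
  assert(c.data[1] == 7 && !c.mask[2] && c.dims[0]);
}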
+void ToBatch::visitConstant(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + rn_env[n->output()] = r_node->output(); +} + +// change return tensor to expanded batched tensor, eg: {data, mask, dims} +void ToBatch::visitNumToTensor(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("batch_from_scalar_tensor"), r_node->outputs()); + batch_map[n->output()] = outputs; +} + +// clone prim::TensorToNum to new graph +void ToBatch::visitTensorToNum(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + if(rn_env.find(n->input()) == rn_env.end()){ + rn_env[n->input()] = batch_map.at(n->input())[0]; + } + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + rn_env[n->output()] = r_node->output(); + batch_map[n->output()] = batch_map.at(n->input()); +} + +// clone prim::ListConstruct to new graph +void ToBatch::visitListConstruct(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + if(n->inputs()[0]->type() == DynamicType::get()){ // TensorList: expand directly + std::vector inputs; + for(Value* input: n->inputs()) { + auto res = batch_map.at(input); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + batch_map[n->output()] = inputs; + } + else { // ScalarList: transform to tensor, then transform back + for(Value* input : n->inputs()) { + if(rn_env.find(input) == rn_env.end()){ + rn_env[input] = batch_map.at(input)[0]; + } + } + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + // transform int[] to tensor + auto to_tensor_node = res_graph->create(Symbol::fromQualString("aten::_list_to_tensor")); + to_tensor_node->setStage(n->stage()); + to_tensor_node->addInput(r_node->output()); + res_block->appendNode(to_tensor_node); + rn_env[n->output()] = to_tensor_node->output(); + } +} + +// prim::If transformation: +// elif is not supported +// +// transformation example: +// @torch.jit.batch(batch_size=4) +// def batch_if(a, b): +// if a > b: +// a += b +// else: +// a -= b +// return a +// +// original graph: +// graph(%a.1 : Dynamic +// %b : Dynamic) { +// %2 : Dynamic = aten::gt(%a.1, %b) +// %a : Dynamic = prim::If(%2) +// block0() { +// %a.2 : Dynamic = aten::add[alpha={1}](%a.1, %b) +// -> (%a.2) +// } +// block1() { +// %a.3 : Dynamic = aten::sub[alpha={1}](%a.1, %b) +// -> (%a.3) +// } +// return (%a); +// } +// +// transformed graph: +// graph(%a.1_data : Dynamic +// %a.1_mask : Dynamic +// %a.1_dims : Dynamic +// %b_data : Dynamic +// %b_mask : Dynamic +// %b_dims : Dynamic) { +// %6 : Dynamic = aten::gt(%a.1_data, %b_data) // calculate condition +// %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %9 : int = prim::TensorToNum(%6) +// %10 : Long() = prim::Constant[value={1}]() // if_block +// %alpha.1 : float = prim::TensorToNum(%10) +// %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) +// %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %15 : Long() = prim::Constant[value={1}]() // 
else_block +// %alpha : float = prim::TensorToNum(%15) +// %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) +// %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %20 : Dynamic = aten::type_as(%7, %6) // combine two outputs (batch_where) +// %cond_mask.1 : Dynamic = aten::mul(%6, %20) +// %22 : int = aten::dim(%cond_mask.1) +// %23 : int = prim::Constant[value=1]() +// %24 : int = aten::eq(%22, %23) +// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%24) +// block0() { +// %28 : int = aten::dim(%data.1) +// %29 : int = prim::Constant[value=1]() +// %30 : int = aten::sub(%28, %29) +// %31 : int = prim::Constant[value=1]() +// %data.3 : Dynamic = prim::Loop(%30, %31, %cond_mask.1) +// block0(%_ : int, %34 : Dynamic) { +// %35 : int = prim::Constant[value=1]() +// %36 : int = aten::neg(%35) +// %data.2 : Dynamic = aten::unsqueeze(%34, %36) +// %38 : int = prim::Constant[value=1]() +// -> (%38, %data.2) +// } +// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) +// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) +// -> (%cond_data.1, %cond_mask.2, %data.3) +// } +// block1() { +// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) +// } +// %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) +// %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) +// %res_dims : Dynamic = aten::__or__(%dims.1, %dims) +// return (%res_data, %res_mask, %res_dims); +// } +void ToBatch::visitIf(Node* n, Block* block, Block* res_block){ + toBatch(n->blocks()[0], res_block); + toBatch(n->blocks()[1], res_block); + + // combine results from two if paths + for(size_t i = 0; i < n->outputs().size(); i++){ + std::vector inputs; + if(batch_map.find(n->input()) == batch_map.end()){ // cond is scalar + inputs.push_back(rn_env.at(n->input())); + } + else{ // cond is tensor + auto cond = batch_map.at(n->input()); + inputs.insert(inputs.end(), cond.begin(), cond.end()); + } + auto if_output = batch_map.at(n->blocks()[0]->outputs()[i]); + inputs.insert(inputs.end(), if_output.begin(), if_output.end()); + auto else_output = batch_map.at(n->blocks()[1]->outputs()[i]); + inputs.insert(inputs.end(), else_output.begin(), else_output.end()); + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where", inputs.size()), inputs); + batch_map[n->outputs()[i]] = outputs; + } +} + +// prim::Loop transformation: +// +// transformation example: +// @torch.jit.batch(batch_size=4) +// def batch_while(a, b): +// while a > b: +// a -= b +// return a +// +// original graph: +// graph(%a.1 : Dynamic +// %b : Dynamic) { +// %2 : int = prim::Constant[value={2147483647}]() +// %3 : Dynamic = aten::gt(%a.1, %b) +// %a : Dynamic = prim::Loop(%2, %3, %a.1) +// block0(%4 : Dynamic, %5 : Dynamic) { +// %a.2 : Dynamic = aten::sub[alpha={1}](%5, %b) +// %9 : Dynamic = aten::gt(%a.2, %b) +// -> (%9, %a.2) +// } +// return (%a); +// } +// +// transformed graph: +// graph(%a.1_data : Dynamic +// %a.1_mask : Dynamic +// %a.1_dims : Dynamic +// %b_data : Dynamic +// %b_mask : Dynamic +// %b_dims : Dynamic) { +// %6 : int = prim::Constant[value=2147483647]() +// %7 : Dynamic = aten::gt(%a.1_data, %b_data) +// %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %10 : int = prim::TensorToNum(%7) +// %11 : Dynamic = aten::mul(%7, %8) +// %12 : Dynamic = aten::sum(%11) +// %13 : Dynamic = aten::gt[other={0}](%12) // cond_any +// %14 : int = prim::TensorToNum(%13) 
+// %62 : Dynamic, %63 : Dynamic, %64 : Dynamic, %a : Dynamic, %60 : Dynamic, %61 : Dynamic = prim::Loop(%6, %14, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) +// block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { +// %23 : Long() = prim::Constant[value={1}]() +// %alpha : float = prim::TensorToNum(%23) +// %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) +// %mask : Dynamic = aten::mul(%6_mask, %b_mask) +// %dims : Dynamic = aten::__or__(%6_dims, %b_dims) +// %28 : Dynamic = aten::gt(%data.1, %b_data) +// %29 : Dynamic = aten::mul(%mask, %b_mask) +// %30 : Dynamic = aten::__or__(%dims, %b_dims) +// %31 : int = prim::TensorToNum(%28) +// %32 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) // update outputs (batch_where) +// %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %32) +// %34 : int = aten::dim(%cond_mask.1) +// %35 : int = prim::Constant[value=1]() +// %36 : int = aten::eq(%34, %35) +// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%36) +// block0() { +// %40 : int = aten::dim(%data.1) +// %41 : int = prim::Constant[value=1]() +// %42 : int = aten::sub(%40, %41) +// %43 : int = prim::Constant[value=1]() +// %data.3 : Dynamic = prim::Loop(%42, %43, %cond_mask.1) +// block0(%_ : int, %46 : Dynamic) { +// %47 : int = prim::Constant[value=1]() +// %48 : int = aten::neg(%47) +// %data.2 : Dynamic = aten::unsqueeze(%46, %48) +// %50 : int = prim::Constant[value=1]() +// -> (%50, %data.2) +// } +// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) +// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) +// -> (%cond_data.1, %cond_mask.2, %data.3) +// } +// block1() { +// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) +// } +// %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) +// %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) +// %res_dims : Dynamic = aten::__or__(%dims, %6_dims) +// %56 : Dynamic = aten::mul(%28, %29) +// %57 : Dynamic = aten::sum(%56) +// %58 : Dynamic = aten::gt[other={0}](%57) +// %59 : int = prim::TensorToNum(%58) +// -> (%59, %28, %29, %30, %res_data, %res_mask, %res_dims) +// } +// return (%a, %60, %61); +// } +void ToBatch::visitLoop(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + // bool cond_is_tensor indicates whether cond is tensor + // cond_is_tensor = false, eg: for loop, n->inputs()[1] = byte() + // cond_is_tensor = true, eg: in some while loop, cond is a batched tensor, + // we need to add expanded cond to the inputs of loop node and block, + // and compute cond_any as cond for while loop + bool cond_is_tensor = (batch_map.find(n->inputs()[1]) != batch_map.end()); + + // create prim::Loop node for res_block + + // type of cond in loop should be int type + if(rn_env.at(n->inputs()[0])->type() != IntType::get()){ + auto to_int_node = res_graph->createTensorToNum(IntType::get(), rn_env.at(n->inputs()[0])); + res_graph->insertNode(to_int_node); + rn_env[n->inputs()[0]] = to_int_node->output(); + } + if(cond_is_tensor){ + auto cond = batch_map.at(n->inputs()[1]); + auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); + auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); + res_graph->insertNode(to_int_node); + rn_env[n->inputs()[1]] = to_int_node->output(); + } + for(size_t i = 2; i < n->inputs().size(); i++){ + auto input = n->inputs()[i]; + rn_env[input] = 
batch_map.at(input)[0]; + } + auto* r_node = res_graph->createClone(n, rn_fn, /*copy_blocks=*/false); + + // change inputs of prim::Loop + if(cond_is_tensor){ + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + auto cond = batch_map.at(n->inputs()[1]); + r_node->insertInput(i + 2, cond[i]); + } + } + for(size_t i = 2; i < n->inputs().size(); i++){ + for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ + r_node->insertInput((i - 2) * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE * cond_is_tensor + 2 + j, batch_map.at(n->inputs()[i])[j]); + } + } + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + + // create block for Loop node in res_block + // if cond is tensor: first 4 inputs of block: cond_any, cond_data, cond_mask, cond_dims + // if cond is not tensor: first 1 input of block: cond + auto loop_block = r_node->addBlock(); + + // add inputs + loop_block->addInput("loop_num"); + loop_block->inputs()[0]->setType(IntType::get()); + rn_env[n->blocks()[0]->inputs()[0]] = loop_block->inputs()[0]; + if(cond_is_tensor){ + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + loop_block->addInput("cond_" + EXP_BTENSOR_NAME[i]); + } + } + for(size_t i = 1; i < n->blocks()[0]->inputs().size(); i++){ + auto input = n->blocks()[0]->inputs()[i]; auto name = input->uniqueName(); - res_block->addInput(name + "_data"); - res_block->addInput(name + "_mask"); - res_block->addInput(name + "_dims"); - batch_map[input] = std::vector(res_block->inputs().slice(i * 3, 3)); + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + loop_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); + } + batch_map[input] = std::vector(loop_block->inputs().slice((i - 1) * EXP_BTENSOR_SIZE + 1 + EXP_BTENSOR_SIZE * cond_is_tensor, EXP_BTENSOR_SIZE).vec()); + } + + toBatch(n->blocks()[0], loop_block); + + WithInsertPoint guard(loop_block); + + // use where operator to update variables and add to outputs + for(size_t i = 0; i < n->outputs().size(); i++){ + std::vector inputs, outputs; + if(cond_is_tensor){ + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + inputs.push_back(loop_block->inputs()[j + 1]); + } + auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); + inputs.insert(inputs.end(), data.begin(), data.end()); + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + EXP_BTENSOR_SIZE + 1]); + } + outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where"), inputs); + } + else{ + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + 1]); + } + auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); + inputs.insert(inputs.end(), data.begin(), data.end()); + outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("update"), inputs); + } + batch_map[n->outputs()[i]] = outputs; + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + loop_block->registerOutput(outputs[j]); + } + } + + // update loop conditions + if(cond_is_tensor){ + auto cond = batch_map.at(n->blocks()[0]->outputs()[0]); + auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); + auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); + res_graph->insertNode(to_int_node); + loop_block->insertOutput(0, to_int_node->output()); + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + loop_block->insertOutput(i + 1, cond[i]); + } + } + else{ + auto cond = rn_env.at(n->blocks()[0]->outputs()[0]); + loop_block->insertOutput(0, cond); + } + + // change outputs of 
prim::Loop + auto size = r_node->outputs().size(); + for(size_t i = 0; i < size; i++){ + for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ + r_node->insertOutput(i * EXP_BTENSOR_SIZE + j); + } + batch_map[n->outputs()[i]] = r_node->outputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec(); + } + // add cond to outputs of loop node + if(cond_is_tensor){ + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + r_node->insertOutput(i); + } + } +} + +void ToBatch::toBatch(Block* block, Block* res_block) { + WithInsertPoint guard(res_block); + + // change inputs of block - expand tensor to batchtensor eg: (data, mask, dims) + // eg: a -> a_data, a_mask, a_dims + // for block in prim::Loop, register inputs separately to deal with cond + if(!block->owningNode() || block->owningNode()->kind() != prim::Loop){ + auto size = block->inputs().size(); + for(size_t i = 0; i < size; i++){ + auto input = block->inputs()[i]; + auto name = input->uniqueName(); + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + res_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); + } + batch_map[input] = std::vector(res_block->inputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec()); + } } for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) { auto n = *it; - // replace tensor operator to BatchTensor operator if(n->kind().is_aten()){ - auto batch_graph = batch_operator_table.at(n->kind().toUnqualString()); - WithInsertPoint guard(res_block); - std::vector new_inputs; - for(Value *input : n->inputs()){ - if(batch_map.find(input) != batch_map.end()){ - auto new_input = batch_map.at(input); - new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); - } - else{ - throw std::runtime_error("NYI: non-tensor input for aten operator is not supported yet"); - } - } - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); - // Assume all outputs from inlined operator implementation are in the triple form. 
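The index arithmetic in visitLoop above encodes a fixed layout for the transformed loop block's inputs: loop_num first, then the cond triple when the condition is a tensor, then one {data, mask, dims} triple per carried value. The sketch below restates that offset computation; carriedValueOffset and kTripleSize are illustrative helpers, not part of the pass.

#include <cassert>
#include <cstddef>

// Assumed layout of the transformed loop block's inputs (a sketch):
//   index 0                   : loop_num
//   indices 1..3              : cond_data, cond_mask, cond_dims (only if cond is a tensor)
//   then, per carried value i : <name>_data, <name>_mask, <name>_dims
constexpr std::size_t kTripleSize = 3;  // {data, mask, dims}

// First block-input index of the i-th carried value's triple.
std::size_t carriedValueOffset(std::size_t i, bool cond_is_tensor) {
  return 1 + (cond_is_tensor ? kTripleSize : 0) + i * kTripleSize;
}

int main() {
  // With a tensor condition: loop_num, cond triple, then value triples.
  assert(carriedValueOffset(0, true) == 4);
  assert(carriedValueOffset(1, true) == 7);
  // With a scalar condition the cond triple is absent.
  assert(carriedValueOffset(0, false) == 1);
}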
- for(size_t i = 0; i < n->outputs().size(); i++){ - auto output = n->outputs()[i]; - batch_map[output] = std::vector(outputs.begin() + i * 3, outputs.begin() + i * 3 + 3); - } + visitAten(n, block, res_block); } else if(n->kind().is_prim()){ - throw std::runtime_error("NYI: node of prim kind is not supported to transform to batch graph yet"); + switch(n->kind()){ + case prim::Constant: + visitConstant(n, block, res_block); + break; + case prim::NumToTensor: + visitNumToTensor(n, block, res_block); + break; + case prim::TensorToNum: + visitTensorToNum(n, block, res_block); + break; + case prim::ListConstruct: + visitListConstruct(n, block, res_block); + break; + case prim::If: + visitIf(n, block, res_block); + break; + case prim::Loop: + visitLoop(n, block, res_block); + break; + default: + throw std::runtime_error("NYI: node of prim kind other than [Constant, NumToTensor, TensorToNum, If, Loop] is not supported yet"); + } + } + else{ + throw std::runtime_error("NYI: node that is not aten or prim kind is not supported yet"); } } - // change outputs of a graph - expand tensor to {data, mask, dims} - for(Value* output : block->outputs()){ - auto r_output = batch_map.at(output); - res_block->registerOutput(r_output[0]); - res_block->registerOutput(r_output[1]); - res_block->registerOutput(r_output[2]); + // change outputs of block - expand tensor to batchtensor(data, mask, dims) + // for block in prim::Loop, register outputs separately to deal with cond and cond_any + // for block in prim::If, register outputs separately by combining outputs from two paths and return + if(!block->owningNode() || (block->owningNode()->kind() != prim::Loop && block->owningNode()->kind() != prim::If)) { + for(Value* output : block->outputs()){ + auto r_output = batch_map.at(output); + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + res_block->registerOutput(r_output[i]); + } + } } } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ // std::cout<toString()<(graph->scope_root()); + std::shared_ptr res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); // std::cout<toString()<(); m.def("to_batch_graph", &to_batch_graph); m.def("register_batch_operator", [](std::string name, std::shared_ptr graph){ - ToBatch::batch_operator_table[name] = graph; + ToBatch::batch_operator_table[name].push_back(graph); }); } diff --git a/torch/csrc/jit/passes/to_batch.h b/torch/csrc/jit/passes/to_batch.h index 23c23a0632b310..6545e2a2d4f8ed 100644 --- a/torch/csrc/jit/passes/to_batch.h +++ b/torch/csrc/jit/passes/to_batch.h @@ -3,14 +3,33 @@ #include "torch/csrc/jit/pybind.h" #include "torch/csrc/jit/ir.h" +#include + namespace torch { namespace jit { class ToBatch { private: + // number of tensors to represent a expanded BatchTensor. {data, mask, dims} for now. 
+ const size_t EXP_BTENSOR_SIZE = 3; + const std::vector EXP_BTENSOR_NAME = {"data", "mask", "dims"}; // mapping from tensor in original graph to {data, mask, dims} in new graph std::unordered_map> batch_map; + // mapping from input in original graph to new input in new graph - used in createClone + std::unordered_map rn_env; + std::function rn_fn = [this](Value* v) { return rn_env.at(v); }; + +private: + std::shared_ptr getBatchOperator(std::string name, int64_t input_num = -1); + void visitAten(Node* n, Block* block, Block* res_block); + void visitConstant(Node* n, Block* block, Block* res_block); + void visitNumToTensor(Node* n, Block* block, Block* res_block); + void visitTensorToNum(Node* n, Block* block, Block* res_block); + void visitListConstruct(Node* n, Block* block, Block* res_block); + void visitIf(Node* n, Block* block, Block* res_block); + void visitLoop(Node* n, Block* block, Block* res_block); + public: - static std::unordered_map> batch_operator_table; + static std::unordered_map>> batch_operator_table; TORCH_API void toBatch(Block* block, Block* res_block); }; diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 415fc311086ac9..0598e651d32437 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -4,26 +4,70 @@ namespace torch { namespace jit { -inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) { +inline Stack createStack(const py::tuple& tuple, at::ArrayRef inputs, size_t reserve_extra_space = 0) { + if (tuple.size() != inputs.size()) { + throw std::runtime_error("expected " + std::to_string(inputs.size()) + + " inputs, but got " + std::to_string(tuple.size())); + } + static const auto castToIValue = [](const py::object& obj, Type& t) -> IValue{ + switch (t.kind()) { + case TypeKind::DynamicType: + case TypeKind::TensorType: + return py::cast(obj); + case TypeKind::FloatType: + return py::cast(obj); + case TypeKind::IntType: + return py::cast(obj); + case TypeKind::NoneType: + return {}; + case TypeKind::ListType: + case TypeKind::TupleType: + throw std::runtime_error("Lists and tuples are not supported yet"); + case TypeKind::NumberType: + throw std::runtime_error("Insufficient type information to convert input"); + } + throw std::runtime_error("Missing cases in castToIValue! 
File a bug report."); + }; Stack result; result.reserve(tuple.size() + reserve_extra_space); - for(auto e : tuple) { - result.push_back(py::cast(e)); + for (size_t i = 0; i < inputs.size(); ++i) { + result.push_back(castToIValue(tuple[i], *inputs[i]->type())); } return result; } -inline py::object wrapStack(Stack&& outputs) { +inline py::object wrapStack(Stack&& outputs, at::ArrayRef output_vals) { + if (outputs.size() != output_vals.size()) { + throw std::runtime_error("expected " + std::to_string(output_vals.size()) + + " outputs, but got " + std::to_string(outputs.size())); + } + static const auto createOutput = [](IValue && ivalue, Value * value) -> py::object { + switch (value->type()->kind()) { + case TypeKind::DynamicType: + case TypeKind::TensorType: + return py::cast(autograd::Variable(ivalue.toTensor())); + case TypeKind::FloatType: + return py::cast(ivalue.toDouble()); + case TypeKind::IntType: + return py::cast(ivalue.toInt()); + case TypeKind::NoneType: + return py::none(); + case TypeKind::ListType: + case TypeKind::TupleType: + throw std::runtime_error("Lists and tuples are not supported yet"); + case TypeKind::NumberType: + throw std::runtime_error("Insufficient type information to convert input"); + } + throw std::runtime_error("Missing cases in createOutput! File a bug report."); + }; if (outputs.size() == 0) { return py::none(); } else if (outputs.size() == 1) { - JIT_ASSERT(outputs[0].isTensor()); - return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor())); + return createOutput(std::move(outputs[0]), output_vals[0]); } else { py::tuple tuple(outputs.size()); for(size_t i = 0; i < outputs.size(); i++) { - JIT_ASSERT(outputs[i].isTensor()); - tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor())); + tuple[i] = createOutput(std::move(outputs[i]), output_vals[i]); } return tuple; } diff --git a/torch/csrc/jit/python_arg_flatten.h b/torch/csrc/jit/python_arg_flatten.h index b5139032fde169..3e1477e52e0701 100644 --- a/torch/csrc/jit/python_arg_flatten.h +++ b/torch/csrc/jit/python_arg_flatten.h @@ -14,7 +14,7 @@ namespace torch { namespace jit { namespace python { struct IODescriptor { struct VariableMetadata { VariableMetadata(const autograd::Variable& var) - : sizes(var.sizes()) + : sizes(var.sizes().vec()) , type(var.type().scalarType()) , device(var.type().is_cuda() ? 
var.get_device() : -1) , requires_grad(var.requires_grad()) {} @@ -104,7 +104,7 @@ struct ParsedArgs { ParsedArgs flatten(py::handle obj); -PyObject* unflatten(at::ArrayRef outputs, +PyObject* unflatten(at::ArrayRef vars, const IODescriptor& structure); }}} // namespace torch::jit::python diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 81211085569953..b72fdb6b8860b1 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -451,10 +451,22 @@ void initPythonIRBindings(PyObject * module_) { .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); }) - ; + .def("__eq__", [](std::shared_ptr& self, std::shared_ptr& other) { + return *self == *other; + }) + .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { + return self->isSubtypeOf(other); + }); + py::class_>(m, "NumberType") + .def_static("get", &NumberType::get); + py::class_>(m, "IntType") + .def_static("get", &IntType::get); + py::class_>(m, "FloatType") + .def_static("get", &FloatType::get); py::class_>(m, "DynamicType") - .def(py::init([](){ return DynamicType::create(); })); + .def_static("get", &DynamicType::get); + py::class_>(m, "TupleType") .def(py::init([](std::vector a){ return TupleType::create(a); })) .def("elements", [](TupleType &self){ @@ -465,7 +477,9 @@ void initPythonIRBindings(PyObject * module_) { return types; }); py::class_>(m, "ListType") - .def_static("ofInts", &ListType::ofInts); + .def_static("ofInts", &ListType::ofInts) + .def_static("ofTensors", &ListType::ofTensors) + .def("getElementType", &ListType::getElementType); py::class_(m,"Use") .def_readonly("user",&Use::user) diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 7439b2b5e334cc..0496af67412654 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -103,10 +103,10 @@ void pythonRecordSourceLocation(Node* n) { n->setSourceLocation(sl); } -void initPythonTracerBindings(PyObject* module_) { +void initPythonTracerBindings(PyObject* module) { setRecordSourceLocation(pythonRecordSourceLocation); - auto m = py::handle(module_).cast(); + auto m = py::handle(module).cast(); py::class_>(m, "TracingState", py::dynamic_attr()) // NB: no constructor; you have to get it from C++ code .def("__repr__", [](const TracingState& s) { diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 8fe747e59900f0..f2b8ea18a2be24 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -231,6 +231,18 @@ RegisterOperators reg({ push(stack, std::move(vals)); return 0; }; + } else if (lt->getElementType()->isSubtypeOf(DynamicType::get())) { + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i]).toTensor()); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } else { std::stringstream ss; ss << "unsupported list type: " << *lt->getElementType(); @@ -335,7 +347,35 @@ RegisterOperators reg2({ return 0; }; }), - + Operator( + "aten::_tensor_to_list(Tensor a) -> int[]", + [](Node* node) { + return [=](Stack& stack) { + at::Tensor t; + pop(stack, t); + std::vector elems; + for(int i = 0; i < t.size(0); i++){ + elems.push_back(*t[i].toIntData()); + } + push(stack, jit::IntList::create(elems)); + return 0; + }; + }), + Operator( + "aten::_list_to_tensor(int[] a) -> 
Tensor", + [](Node* node) { + return [=](Stack& stack) { + std::vector l; + pop(stack, l); + auto t = torch::empty( + {static_cast(l.size())}, at::dtype(at::kInt)); + for(size_t i = 0; i < l.size(); i++){ + t[i] = l[i]; + } + push(stack, t); + return 0; + }; + }), // commutative DEFINE_ST_OP(mul, at::mul(b, a)) DEFINE_ST_OP(add, at::add(b, a)) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 0016f69b5ce07b..4f27cb25b53cb7 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -351,37 +351,19 @@ Value* createNumber(Graph& g, const SourceRange& loc, const at::Tensor& val) { return output; } -Value* createStack(Graph& g, const SourceRange& loc, at::ArrayRef inputs) { - // bake in constant propagation for the all-constant case because it is - // common to see constant lists like [1, 2] passed to attributes - bool all_constant = std::all_of(inputs.begin(), inputs.end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - if(all_constant) { - auto values = fmap(inputs, [&](Value* v) { - return v->node()->t(attr::value); - }); - return insertConstant(g, at::stack(values), loc); - } - return g.insertNode(g.create(aten::stack, inputs) - ->i_(attr::dim, 0) - ->setSourceLocation(std::make_shared(loc)))->output(); -} - -static bool isTensorSubtype(Value* v) { - return v->type()->isSubtypeOf(DynamicType::get()); -} - at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as>(input); if(list) - return std::vector(list.value()->elements()); + return list.value()->elements().vec(); + // broadcast IntList[3] with value 4 -> {4, 4, 4} if(!N) return at::nullopt; + auto r = constant_as(input); if(!r) return at::nullopt; + // broadcast to attribute size return std::vector(*N, *r); } @@ -455,51 +437,46 @@ at::optional> tryMatchSchema( } // check input types - std::vector flat_inputs; + std::vector matched_inputs; for(size_t i = 0; i < schema.arguments.size(); ++i) { - NamedValue v = *positional_inputs[i]; + Value* value = positional_inputs[i]->value; const auto& arg = schema.arguments[i]; // some functions that take lists of integers for fixed size arrays // also allow single ints to be passed in their place. 
// the single int is then repeated to the length of the list - if (isIntUsedAsIntList(v.value, arg)) { - std::vector repeated(*arg.N, v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); + if (isIntUsedAsIntList(value, arg)) { + std::vector repeated(*arg.N, value); + value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } - // Allow tuples that only contain integers to turn into lists of integers - if(*ListType::ofInts() == *arg.type && - v.value->type()->kind() == TypeKind::TupleType && - v.value->type()->isSubtypeOf(ListType::ofInts())) { - auto unpacked = createTupleUnpack(v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), unpacked))->output(); + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types + if (arg.type->kind() == TypeKind::ListType && + value->type()->kind() == TypeKind::TupleType && + value->type()->isSubtypeOf(arg.type)) { + auto unpacked = createTupleUnpack(value); + auto elem_type = arg.type->expect()->getElementType(); + value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } - if (v.value->node()->kind() == prim::None){ + if (value->node()->kind() == prim::None){ if (arg.type->isSubtypeOf(NumberType::get())) - v.value = insertConstant(graph, at::Scalar(NAN), loc); + value = insertConstant(graph, at::Scalar(NAN), loc); else - v.value = graph.insertNode(graph.createUndefined())->output(); + value = graph.insertNode(graph.createUndefined())->output(); } - if(!v.value->type()->isSubtypeOf(arg.type)) { + if(!value->type()->isSubtypeOf(arg.type)) { err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " - << v.value->type()->str() << "\n" - << v.loc; + << value->type()->str() << "\n" + << positional_inputs[i]->loc; return at::nullopt; } - // we only support tensor lists for builtins, where they must be flattened - if(arg.type->isSubtypeOf(ListType::ofTensors())) { - auto outputs = createTupleUnpack(v.value); - flat_inputs.insert(flat_inputs.end(), outputs.begin(), outputs.end()); - } else { - flat_inputs.push_back(v.value); - } + matched_inputs.push_back(value); } - return flat_inputs; + return matched_inputs; } @@ -513,27 +490,27 @@ static std::shared_ptr tryEmitBuiltin( at::ArrayRef attributes) { auto graph = method.graph(); - auto flat_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); - if(!flat_inputs) + auto matched_inputs = tryMatchSchema(op->schema(), loc, *graph, inputs, attributes, failure_messages); + if(!matched_inputs) return nullptr; // we successfully matched this schema, construct the node NodeKind kind(Symbol::aten(name)); - auto n = graph->insertNode(graph->create(kind, *flat_inputs, 0)) + auto n = graph->insertNode(graph->create(kind, *matched_inputs, 0)) ->setSourceLocation(std::make_shared(loc)); // special case for chunk when the chunks= is known // DO NOT ADD MORE SPECIAL CASES HERE, REFACTOR INTO A FUNCTION IF // NEEDED if(n->kind() == aten::chunk) { - auto value = constant_as((*flat_inputs)[1]); + auto value = constant_as((*matched_inputs)[1]); if(!value) { throw ErrorReport(loc) << "argument 'chunks' must be a constant"; } for(int64_t i = 0; i < *value; ++i) n->addOutput(); } else { - for(auto & ret : op->schema.returns) { + for(auto & ret : op->schema().returns) { n->addOutput()->setType(ret.type); } } @@ -588,7 +565,7 @@ std::shared_ptr emitBuiltinCall( } static Value* ensureTensor(const SourceRange& range, Value* v) { - 
if(!isTensorSubtype(v)) { + if(!v->type()->isSubtypeOf(DynamicType::get())) { throw ErrorReport(range) << "expected a tensor value but found a " << v->type()->str(); } @@ -700,7 +677,7 @@ struct to_ir { if (return_stmt.values().size() == 1 && results.size() == 1) { auto result = results.at(0); if(result->type()->cast()) { - results = createTupleUnpack(result); + results = createTupleUnpack(result).vec(); } } if (typed_def.schema && typed_def.schema->returns.size() != results.size()) { @@ -711,12 +688,16 @@ struct to_ir { auto range = return_stmt.range(); size_t return_type_idx = 0; for (auto& r : results) { - if(r->type()->isSubtypeOf(NumberType::get())) { - graph->registerOutput(numToTensor(range, r)); - } else { - ensureTensor(range, r); - graph->registerOutput(r); + // TODO: support tuples and lists as returns + auto return_kind = r->type()->kind(); + if (return_kind != TypeKind::TensorType && + return_kind != TypeKind::DynamicType && + return_kind != TypeKind::IntType && + return_kind != TypeKind::FloatType) { + throw ErrorReport(return_stmt.range()) << "The only supported return types " + << "are tensors, ints and floats"; } + graph->registerOutput(r); TypePtr type = DynamicType::get(); if (typed_def.schema) { type = typed_def.schema->returns.at(return_type_idx).type; @@ -1387,6 +1368,11 @@ struct to_ir { auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); return graph->insertNode(graph->createTuple(values))->output(); } break; + case TK_TUPLE_LITERAL: { + auto ll = TupleLiteral(tree); + auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); + return graph->insertNode(graph->createTuple(values))->output(); + } break; default: throw ErrorReport(tree) << "NYI: " << tree; break; diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index 0b87cf56be6ad3..3c4dcb07a248ee 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -68,7 +68,7 @@ struct SugaredValue : public std::enable_shared_from_this { SourceRange loc, Method & m, // note: names for args will be 'argument 0', 'argument 1', etc.. 
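Referring back to the homogeneous-tuple-to-list coercion in tryMatchSchema above, the acceptance rule can be summarised with a toy type check: a tuple may stand in for a list argument only when every element matches the list's element type. ToyListType, ToyTupleType, and tupleCoercesToList are illustrative stand-ins, not the real TupleType/ListType/isSubtypeOf machinery.

#include <cassert>
#include <string>
#include <vector>

// Toy stand-ins for the type system used during schema matching.
struct ToyListType  { std::string elem; };
struct ToyTupleType { std::vector<std::string> elems; };

// A homogeneous tuple may be implicitly converted to a list of the matching
// element type, e.g. (int, int, int) -> int[]; mixed tuples may not.
bool tupleCoercesToList(const ToyTupleType& tup, const ToyListType& list) {
  for (const auto& e : tup.elems)
    if (e != list.elem) return false;
  return true;
}

int main() {
  assert(tupleCoercesToList({{"int", "int", "int"}}, {"int"}));
  assert(!tupleCoercesToList({{"int", "float"}}, {"int"}));
}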
- at::ArrayRef inputs, + at::ArrayRef inputs_, at::ArrayRef attributes, size_t n_binders) { // n_binders is always set to the number of variables an expression is @@ -89,7 +89,7 @@ struct SugaredValue : public std::enable_shared_from_this { throw ErrorReport(loc) << "cannot call a " << kind(); } - virtual ~SugaredValue() {} + virtual ~SugaredValue() = default; }; // most things in the environment are just simple value types diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index cb7893234dc747..39bb51ed89ca5d 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -370,10 +370,15 @@ static void gatherParametersAndBuffers(std::vector & values, const } } +Stack createStack(const py::tuple& tuple, const Method& method) { + auto relevant_inputs = method.graph()->inputs().slice(0, method.num_inputs()); + return createStack(tuple, relevant_inputs); +} + py::object runMethodFromPython(Method& m, py::args args) { - auto stack = createStack(args); + auto stack = createStack(args, m); m.run(stack); - return wrapStack(std::move(stack)); + return wrapStack(std::move(stack), m.graph()->outputs()); } void initJitScriptBindings(PyObject* module) { @@ -502,7 +507,8 @@ void initJitScriptBindings(PyObject* module) { }) .def("graph_for", [](Module& self, py::args args) { if (self.find_method("forward")) { - return self.get_method("forward").graph_for(createStack(args)); + Method & m = self.get_method("forward"); + return m.graph_for(createStack(args, m.graph()->inputs())); } throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()"); }) @@ -530,7 +536,7 @@ void initJitScriptBindings(PyObject* module) { .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes) .def("params", &Method::params) .def("graph_for", [](Method& self, py::args args) { - return self.graph_for(createStack(args)); + return self.graph_for(createStack(args, self.graph()->inputs())); }) .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) { std::vector arg_type_args, return_type_args; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 912b488dde5d9e..1694889d630d39 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -75,6 +75,7 @@ namespace script { _(TK_GATHER, "gather", "") \ _(TK_NOTHING, "nothing", "") \ _(TK_LIST_LITERAL, "list-literal", "") \ + _(TK_TUPLE_LITERAL, "tuple-literal", "") \ _(TK_FOR, "for", "for") \ _(TK_IN, "in", "in") \ _(TK_STARRED, "starred", "") \ diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index abea2778053699..0cd833dc15e488 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -30,7 +30,7 @@ struct Parser { List(makeList(range, std::move(attributes)))); } // exp | expr, | expr, expr, ... 
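The grammar comment above ("exp | expr, | expr, expr, ...") is the rule parseExpOrExpTuple implements: a lone expression stays an expression, while a trailing comma or multiple comma-separated expressions produce a tuple literal, and empty parentheses produce the empty tuple. Below is a hedged standalone sketch of just that decision, independent of the real Lexer/TreeRef types; the token-vector interface is an assumption for illustration.

#include <cassert>
#include <string>
#include <vector>

// Result of the toy rule: either a single expression or a tuple literal.
struct ParsedExpr {
  bool is_tuple;
  std::vector<std::string> parts;
};

// Sketch of "exp | expr, | expr, expr, ...": a trailing comma or more than
// one expression yields a tuple; no expressions models the "()" empty tuple.
ParsedExpr parseExpOrExpTuple(const std::vector<std::string>& tokens) {
  ParsedExpr result{false, {}};
  bool saw_comma = false;
  for (const auto& tok : tokens) {
    if (tok == ",") { saw_comma = true; continue; }
    result.parts.push_back(tok);
  }
  result.is_tuple = saw_comma || result.parts.size() != 1;
  return result;
}

int main() {
  assert(!parseExpOrExpTuple({"a"}).is_tuple);           // plain expression
  assert(parseExpOrExpTuple({"a", ","}).is_tuple);       // trailing comma
  assert(parseExpOrExpTuple({"a", ",", "b"}).is_tuple);  // expression list
  assert(parseExpOrExpTuple({}).is_tuple);               // () is the empty tuple
}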
- TreeRef parseExpOrExpList(int end) { + TreeRef parseExpOrExpTuple(int end) { auto prefix = parseExp(); if(L.cur().kind == ',') { std::vector exprs = { prefix }; @@ -39,7 +39,7 @@ struct Parser { exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); - prefix = ListLiteral::create(list.range(), list); + prefix = TupleLiteral::create(list.range(), list); } return prefix; } @@ -61,7 +61,14 @@ struct Parser { } break; case '(': { L.next(); - prefix = parseExpOrExpList(')'); + if (L.nextIf(')')) { + /// here we have the empty tuple case + std::vector vecExpr; + List listExpr = List::create(L.cur().range, vecExpr); + prefix = TupleLiteral::create(L.cur().range, listExpr); + break; + } + prefix = parseExpOrExpTuple(')'); L.expect(')'); } break; case '[': { @@ -242,7 +249,7 @@ struct Parser { // first[,other,lhs] = rhs Assign parseAssign(List list) { auto red = parseOptionalReduction(); - auto rhs = parseExpOrExpList(TK_NEWLINE); + auto rhs = parseExpOrExpTuple(TK_NEWLINE); L.expect(TK_NEWLINE); return Assign::create(list.range(), list, AssignKind(red), Expr(rhs)); } diff --git a/torch/csrc/jit/script/python_tree_views.cpp b/torch/csrc/jit/script/python_tree_views.cpp index 569d1b0e66fdf3..7ece5e055a33df 100644 --- a/torch/csrc/jit/script/python_tree_views.cpp +++ b/torch/csrc/jit/script/python_tree_views.cpp @@ -193,6 +193,10 @@ void initTreeViewBindings(PyObject *module) { .def(py::init([](const SourceRange& range, std::vector args) { return ListLiteral::create(range, wrap_list(range, std::move(args))); })); + py::class_(m, "TupleLiteral") + .def(py::init([](const SourceRange& range, std::vector args) { + return TupleLiteral::create(range, wrap_list(range, std::move(args))); + })); py::class_(m, "Gather") .def(py::init([](const Expr& base, const Expr& index) { return Gather::create(base.range(), base, index); diff --git a/torch/csrc/jit/script/tree.h b/torch/csrc/jit/script/tree.h index e3d69d2790682d..0b9bc7009e0162 100644 --- a/torch/csrc/jit/script/tree.h +++ b/torch/csrc/jit/script/tree.h @@ -89,7 +89,7 @@ struct Tree : std::enable_shared_from_this { throw std::runtime_error(ss.str()); } } - virtual ~Tree() {} + virtual ~Tree() = default; private: int kind_; diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index 6cc934ab4d177a..10ac01799c0607 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -58,6 +58,7 @@ namespace script { // | Gather(Expr value, Expr indices) TK_GATHER // | Var(Ident name) TK_VAR // | ListLiteral(List inputs) TK_LIST_LITERAL +// | TupleLiteral(List inputs) TK_TUPLE_LITERAL // | Starred(Expr expr) TK_STARRED // // -- NB: only allowed expressions are Const or List(Const) @@ -255,6 +256,7 @@ struct Expr : public TreeView { case TK_GATHER: case TK_VAR: case TK_LIST_LITERAL: + case TK_TUPLE_LITERAL: case '@': case TK_POW: return; @@ -694,6 +696,17 @@ struct ListLiteral : public Expr { } }; +struct TupleLiteral : public Expr { + explicit TupleLiteral(const TreeRef& tree) : Expr(tree) { + tree_->match(TK_TUPLE_LITERAL); + } + List inputs() const { + return subtree(0); + } + static TupleLiteral create(const SourceRange& range, const List& inputs) { + return TupleLiteral(Compound::create(TK_TUPLE_LITERAL, range, {inputs})); + } +}; struct Starred : public Expr { explicit Starred(const TreeRef& tree) : Expr(tree) { diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 2c74ae7e0a4c77..7a23aa55df538f 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ 
-77,8 +77,8 @@ inline void pack(Stack & stack, T&& v) { } template<> -inline void pack(Stack & stack, std::vector&& ts) { - for(auto& t : ts) { +inline void pack(Stack & stack, std::vector&& v) { + for(auto& t : v) { stack.push_back(IValue(std::move(t))); } } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index e4d2f98ba0ea0f..ef6d41005789f8 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -119,18 +119,20 @@ struct SymbolicVariable { return create(t("narrow"), { *this, insertConstant(dim), insertConstant(start), insertConstant(length) }, 1)[0]; } static SymbolicVariable cat(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::cat, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::cat, {input_list, dim})[0]; } static SymbolicVariable cat(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); return SymbolicVariable::cat(inputs, inputs[0].insertConstant(dim)); } static SymbolicVariable stack(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::stack, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::stack, {input_list, dim})[0]; } static SymbolicVariable stack(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 8c9763f88353e5..d5d204f9465bd8 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -220,6 +220,9 @@ static void fusionTests() { testOne(1,2,0,2); + auto createFusedConcat = [](Graph & graph, at::ArrayRef inputs, int64_t dim) -> Value* { + return graph.insertNode(graph.create(prim::FusedConcat, inputs)->i_(attr::dim, dim))->output(); + }; auto testConcat = [&](int dim) { Graph graph; @@ -227,7 +230,7 @@ static void fusionTests() { Var i1 = Var::asNewInput(graph); auto o0 = i0 * i1; o0.addAsOutput(); - Var::cat({i0, o0}, dim).addAsOutput(); + Var(createFusedConcat(graph, {i0, o0}, dim)).addAsOutput(); auto a = at::rand({3,4,5}, at::kCUDA); auto b = at::rand({4,3,5}, at::kCUDA).transpose(0,1); @@ -776,6 +779,9 @@ void argumentSpecTest() { REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); + Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) }; + ArgumentSpec with_const(true, stack); + REQUIRE(with_const.at(2).sizes().size() == 2); } void shapeAnalysisTest() { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index aec6eb4ddc9447..a0e2f65e617754 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -38,9 +38,9 @@ void addInputs(Node *n, const char * name, const std::string& value) { b void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); } void addInputs(Node *n, const char * name, at::TensorList value) { - for (auto & t : value) { - n->addInput(getValueTrace(t)); - } + Graph *g = n->owningGraph(); + Node *list_node = g->appendNode(g->createList(DynamicType::get(), fmap(value, getValueTrace))); + n->addInput(list_node->output()); } void addInputs(Node *n, const char * name, 
at::IntList value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index ebcc91a908c213..ddb4dfad0154ad 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -46,31 +46,31 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { return out; } -TypePtr DynamicType::get() { +DynamicTypePtr DynamicType::get() { static auto value = DynamicType::create(); return value; } -TypePtr NumberType::get() { +NumberTypePtr NumberType::get() { static auto value = NumberType::create(); return value; } -TypePtr IntType::get() { +IntTypePtr IntType::get() { static auto value = IntType::create(); return value; } -TypePtr FloatType::get() { +FloatTypePtr FloatType::get() { static auto value = FloatType::create(); return value; } -TypePtr NoneType::get() { +NoneTypePtr NoneType::get() { static auto value = NoneType::create(); return value; } -TypePtr ListType::ofTensors() { +ListTypePtr ListType::ofTensors() { static auto value = ListType::create(DynamicType::get()); return value; } -TypePtr ListType::ofInts() { +ListTypePtr ListType::ofInts() { static auto value = ListType::create(IntType::get()); return value; } diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 7b7d708a549b32..713718e40681c8 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -80,7 +80,7 @@ struct TORCH_API Type : std::enable_shared_from_this { JIT_ASSERT(T::Kind == kind()); return std::static_pointer_cast(shared_from_this()); } - virtual ~Type() {} + virtual ~Type() = default; }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -104,7 +104,7 @@ struct TORCH_API DynamicType : public Type { } static const TypeKind Kind = TypeKind::DynamicType; // global singleton - static TypePtr get(); + static DynamicTypePtr get(); private: DynamicType() : Type(TypeKind::DynamicType) {} @@ -186,16 +186,16 @@ struct TORCH_API TensorType : public Type { : Type(TypeKind::TensorType) , scalar_type_(tensor.type().scalarType()) , device_(tensor.type().is_cuda() ? 
tensor.get_device() : -1) - , sizes_(tensor.sizes()) - , strides_(tensor.strides()) {} + , sizes_(tensor.sizes().vec()) + , strides_(tensor.strides().vec()) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes) : TensorType(scalar_type, device, sizes, TensorType::contiguousStridesOf(sizes)) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides) : Type(TypeKind::TensorType) , scalar_type_(scalar_type) , device_(device) - , sizes_(sizes) - , strides_(strides) + , sizes_(sizes.vec()) + , strides_(strides.vec()) {} static std::vector contiguousStridesOf(at::IntList sizes) { std::vector strides(sizes.size()); @@ -237,8 +237,8 @@ struct TORCH_API ListType : public Type { return elem; } // common cast List[Tensor] - static TypePtr ofTensors(); - static TypePtr ofInts(); + static ListTypePtr ofTensors(); + static ListTypePtr ofInts(); private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(elem) {} @@ -326,7 +326,7 @@ struct TORCH_API NumberType : public Type { } static const TypeKind Kind = TypeKind::NumberType; // global singleton - static TypePtr get(); + static NumberTypePtr get(); private: NumberType() : Type(TypeKind::NumberType) {} @@ -351,7 +351,7 @@ struct TORCH_API FloatType : public Type { } static const TypeKind Kind = TypeKind::FloatType; // global singleton - static TypePtr get(); + static FloatTypePtr get(); private: FloatType() : Type(TypeKind::FloatType) {} @@ -376,7 +376,7 @@ struct TORCH_API IntType : public Type { } static const TypeKind Kind = TypeKind::IntType; // global singleton - static TypePtr get(); + static IntTypePtr get(); private: IntType() : Type(TypeKind::IntType) {} @@ -401,7 +401,7 @@ struct NoneType : public Type { } static const TypeKind Kind = TypeKind::NoneType; // global singleton - static TypePtr get(); + static NoneTypePtr get(); private: NoneType() : Type(TypeKind::NoneType) {} diff --git a/torch/csrc/jit/variable_tensor_list.h b/torch/csrc/jit/variable_tensor_list.h index eeae2a66b17e5f..0916fe6ac051d2 100644 --- a/torch/csrc/jit/variable_tensor_list.h +++ b/torch/csrc/jit/variable_tensor_list.h @@ -6,10 +6,10 @@ namespace torch { namespace jit { // a wrapper to mark places where we expect all the at::Tensors to be // variables struct variable_tensor_list : public std::vector { - variable_tensor_list() {} + variable_tensor_list() = default; template variable_tensor_list(InputIt first, InputIt last) - : std::vector(first, last) {} + : std::vector(first, last) {} explicit variable_tensor_list(std::vector && tensor) : std::vector(std::move(tensor)) {} }; diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index b09824ec77b4a5..64747c8c4b83a9 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -1,36 +1,33 @@ #include "torch/csrc/onnx/init.h" -#include "torch/csrc/onnx/onnx.npb.h" #include "torch/csrc/onnx/onnx.h" +#include "onnx/onnx.pb.h" namespace torch { namespace onnx { void initONNXBindings(PyObject* module) { auto m = py::handle(module).cast(); auto onnx = m.def_submodule("_onnx"); - py::enum_(onnx, "TensorProtoDataType") - .value("UNDEFINED", onnx_TensorProto_DataType_UNDEFINED) - .value("FLOAT", onnx_TensorProto_DataType_FLOAT) - .value("UINT8", onnx_TensorProto_DataType_UINT8) - .value("INT8", onnx_TensorProto_DataType_INT8) - .value("UINT16", onnx_TensorProto_DataType_UINT16) - .value("INT16", onnx_TensorProto_DataType_INT16) - .value("INT32", onnx_TensorProto_DataType_INT32) - .value("INT64", onnx_TensorProto_DataType_INT64) - .value("STRING", 
onnx_TensorProto_DataType_STRING) - .value("BOOL", onnx_TensorProto_DataType_BOOL) - .value("FLOAT16", onnx_TensorProto_DataType_FLOAT16) - .value("DOUBLE", onnx_TensorProto_DataType_DOUBLE) - .value("UINT32", onnx_TensorProto_DataType_UINT32) - .value("UINT64", onnx_TensorProto_DataType_UINT64) - .value("COMPLEX64", onnx_TensorProto_DataType_COMPLEX64) - .value("COMPLEX128", onnx_TensorProto_DataType_COMPLEX128); + py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType") + .value("UNDEFINED", ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) + .value("FLOAT", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT) + .value("UINT8", ::ONNX_NAMESPACE::TensorProto_DataType_UINT8) + .value("INT8", ::ONNX_NAMESPACE::TensorProto_DataType_INT8) + .value("UINT16", ::ONNX_NAMESPACE::TensorProto_DataType_UINT16) + .value("INT16", ::ONNX_NAMESPACE::TensorProto_DataType_INT16) + .value("INT32", ::ONNX_NAMESPACE::TensorProto_DataType_INT32) + .value("INT64", ::ONNX_NAMESPACE::TensorProto_DataType_INT64) + .value("STRING", ::ONNX_NAMESPACE::TensorProto_DataType_STRING) + .value("BOOL", ::ONNX_NAMESPACE::TensorProto_DataType_BOOL) + .value("FLOAT16", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) + .value("DOUBLE", ::ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) + .value("UINT32", ::ONNX_NAMESPACE::TensorProto_DataType_UINT32) + .value("UINT64", ::ONNX_NAMESPACE::TensorProto_DataType_UINT64) + .value("COMPLEX64", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64) + .value("COMPLEX128", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128); py::enum_(onnx, "OperatorExportTypes") .value("ONNX", OperatorExportTypes::ONNX) .value("ONNX_ATEN", OperatorExportTypes::ONNX_ATEN) .value("ONNX_ATEN_FALLBACK", OperatorExportTypes::ONNX_ATEN_FALLBACK) .value("RAW", OperatorExportTypes::RAW); - - py::class_(onnx, "ModelProto") - .def("prettyPrint", &ModelProto::prettyPrint); } }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.cpp b/torch/csrc/onnx/onnx.cpp deleted file mode 100644 index fa93f6866d5ed6..00000000000000 --- a/torch/csrc/onnx/onnx.cpp +++ /dev/null @@ -1,214 +0,0 @@ -#include "torch/csrc/onnx/onnx.h" - -namespace torch { namespace onnx { - -template <> -bool micropb_encode(pb_ostream_t *stream, std::string* arg) { - return pb_encode_string(stream, reinterpret_cast(arg->c_str()), arg->size()); -} -// NB: Overloads don't work so great for signed variables. Hope this doesn't -// come up! -template <> -bool micropb_encode(pb_ostream_t *stream, int64_t* arg) { - // Yes, this looks dodgy, and yes, this is what the docs say to do: - // https://jpa.kapsi.fi/nanopb/docs/reference.html#pb-encode-varint - return pb_encode_varint(stream, *reinterpret_cast(arg)); -} -template <> -bool micropb_encode(pb_ostream_t *stream, float* arg) { - return pb_encode_fixed32(stream, static_cast(arg)); -} -template <> -bool micropb_encode(pb_ostream_t *stream, double* arg) { - return pb_encode_fixed64(stream, static_cast(arg)); -} - -template <> -bool micropb_encode(pb_ostream_t *stream, Dimension* arg) { - return pb_encode_submessage(stream, onnx_TensorShapeProto_Dimension_fields, - static_cast(arg)); -} - -// TODO: I'm not entirely sure why this can't be in the header... -bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { - at::Tensor* t = static_cast(*arg); - AT_ASSERT(t->is_contiguous()); - // Packed array format! 
- pb_encode_tag_for_field(stream, field); - pb_encode_string(stream, (pb_byte_t*)(t->data_ptr()), t->type().elementSizeInBytes()*t->numel()); - - return true; -} - -GraphProto* AttributeProto::add_graphs() { - auto ptr = new GraphProto(); - graphs.emplace_back(ptr); - return ptr; -} - -constexpr char indent_char = ' '; -constexpr size_t indent_multiplier = 2; - -std::string idt(size_t indent) { - return std::string(indent * indent_multiplier, indent_char); -} - -std::string nlidt(size_t indent) { - return std::string("\n") + idt(indent); -} - -void TensorProto::dump(std::ostream& stream, size_t indent) { - stream << "TensorProto shape: ["; - for (size_t i = 0; i < dims.size(); ++i) { - stream << *dims[i] << (i == dims.size() - 1 ? "" : " "); - } - stream << "]"; -} - -void TensorShapeProto::dump(std::ostream& stream, size_t indent) { - for (size_t i=0; i < dims.size(); ++i) { - auto &dim = dims[i]; - if (dim->has_dim_value) { - stream << dim->dim_value; - } else { - stream << "?"; - } - stream << (i == dims.size() - 1 ? "" : " "); - } -} - -void TypeProtoTensor::dump(std::ostream& stream, size_t indent) { - stream << "Tensor dims: "; - shape->dump(stream); -} - -void TypeProto::dump(std::ostream& stream, size_t indent) { - tensor_type->dump(stream); -} - -void ValueInfoProto::dump(std::ostream& stream, size_t indent) { - stream << "{name: \"" << name - << "\", type:"; - type->dump(stream); - stream << "}"; -} - -void AttributeProto::dump(std::ostream& stream, size_t indent) { - stream << "{ name: '" << name << "', type: "; - if (proto.has_f) { - stream << "float, value: " << proto.f; - } else if (proto.has_i) { - stream << "int, value: " << proto.i; - } else if (s.length()) { - stream << "string, value: '" << s << "'"; - } else if (g) { - stream << "graph, value:\n"; - g->dump(stream, indent+1); - stream << nlidt(indent); - } else if (t) { - stream << "tensor, value:"; - t->dump(stream, indent+1); - } else if (floats.size()) { - stream << "floats, values: ["; - for (size_t i=0; i < floats.size(); ++i) - stream << *floats[i] << (i == floats.size() - 1 ? "" : " "); - stream << "]"; - } else if (ints.size()) { - stream << "ints, values: ["; - for (size_t i=0; i < ints.size(); ++i) - stream << *ints[i] << (i == ints.size() - 1 ? "" : " "); - stream << "]"; - } else if (strings.size()) { - stream << "strings, values: ["; - for (size_t i=0; i < strings.size(); ++i) - stream << "'" << *strings[i] << "'" << (i == strings.size() - 1 ? "" : " "); - stream << "]"; - } else if (tensors.size()) { - stream << "tensors, values: ["; - for (auto& t : tensors) { - t->dump(stream, indent+1); - } - stream << "]"; - } else if (graphs.size()) { - stream << "graphs, values: ["; - for (auto& g : graphs) { - g->dump(stream, indent+1); - } - stream << "]"; - } else { - stream << "UNKNOWN"; - } - stream << "}"; -} - -void NodeProto::dump(std::ostream& stream, size_t indent) { - stream << "Node {type: \"" << op_type << "\", inputs: ["; - for (size_t i=0; i < inputs.size(); ++i) { - stream << *inputs[i] << (i == inputs.size() - 1 ? "" : ","); - } - stream << "], outputs: ["; - for (size_t i=0; i < outputs.size(); ++i) { - stream << *outputs[i] << (i == outputs.size() - 1 ? "" : ","); - } - stream << "], attributes: ["; - for (size_t i=0; i < attributes.size(); ++i) { - attributes[i]->dump(stream, indent+1); - stream << (i == attributes.size() - 1 ? 
"" : ","); - } - stream << "]}"; -} - -void GraphProto::dump(std::ostream& stream, size_t indent) { - stream << idt(indent) << "GraphProto {" << nlidt(indent+1) - << "name: \"" << name << "\"" << nlidt(indent+1) - << "inputs: ["; - for (size_t i=0; i < inputs.size(); ++i) { - inputs[i]->dump(stream, indent+2); - stream << (i == inputs.size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "outputs: ["; - for (size_t i=0; i < outputs.size(); ++i) { - outputs[i]->dump(stream, indent+2); - stream << (i == outputs.size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "initializers: ["; - for (size_t i=0; i < initializers.size(); ++i) { - initializers[i]->dump(stream, indent+2); - stream << (i == initializers.size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "nodes: [" << nlidt(indent+2); - for (size_t i=0; i < nodes.size(); ++i) { - nodes[i]->dump(stream, indent+2); - if (i != nodes.size() - 1) stream << "," << nlidt(indent+2); - } - stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; -} - -void OperatorSetIdProto::dump(std::ostream& stream, size_t indent) { - stream << "OperatorSetIdProto { domain: " << domain << "}"; -} - -void ModelProto::dump(std::ostream& stream, size_t indent) { - stream << idt(indent) - << "ModelProto {" << nlidt(indent+1) - << "producer_name: \"" << producer_name << "\"" << nlidt(indent+1) - << "domain: \"" << domain << "\"" << nlidt(indent+1) - << "doc_string: \"" << doc_string << "\""; - if (graph) { - stream << nlidt(indent+1) << "graph:\n"; - graph->dump(stream, indent+2); - } - if (opset_import.size()) { - stream << idt(indent+1) << "opset_import: ["; - for (auto &opset_imp : opset_import) { - opset_imp->dump(stream, indent+2); - } - stream << "],\n"; - } - stream << idt(indent) << "}\n"; -} - -}} // namespace onnx diff --git a/torch/csrc/onnx/onnx.h b/torch/csrc/onnx/onnx.h index 7fa38cc03898e9..76170e18110f1b 100644 --- a/torch/csrc/onnx/onnx.h +++ b/torch/csrc/onnx/onnx.h @@ -1,435 +1,11 @@ #pragma once -#include "torch/csrc/onnx/onnx.npb.h" -#include "torch/csrc/WindowsTorchApiMacro.h" - -#include -#include - -#include -#include -#include - namespace torch { namespace onnx { -using DataType = onnx_TensorProto_DataType; -using Dimension = onnx_TensorShapeProto_Dimension; - -// Note [Unique vector] -// ~~~~~~~~~~~~~~~~~~~~ -// Why do we need vectors of unique pointers? A Google-style C++ Protobuf API -// returns raw pointers T* which are expected to stay valid as long as the -// enclosing protobuf is live. However, if we store T directly in a vector, if -// the vector ever resizes (which it may, because we don't know a priori how -// many elements are in the vector) all of these pointers will be invalidated. -// Thus, up-front, we have to give them permanent, dynamically allocated -// addresses. 
-template -using unique_vector = std::vector>; - -// Helper function for encoding inside callbacks -template -bool micropb_encode(pb_ostream_t *stream, T* arg) { - static_assert(Field != nullptr, "no overload in micropb_encode"); - return pb_encode_submessage(stream, Field, static_cast(&arg->proto)); -} -template <> bool micropb_encode(pb_ostream_t *stream, std::string* arg); -template <> bool micropb_encode(pb_ostream_t *stream, int64_t* arg); -template <> bool micropb_encode(pb_ostream_t *stream, float* arg); -template <> bool micropb_encode(pb_ostream_t *stream, double* arg); -template <> bool micropb_encode(pb_ostream_t *stream, Dimension* arg); -// NB: If we ever add support for signed protobuf integers, we'll need a special -// wrapper, since we can't overload over them (they look the same from C++ side) - -// Callback functions of type pb_callback_t. - -// Write out a single protobuf field inside a message -template -bool micropb_callback(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { - if (!pb_encode_tag_for_field(stream, field)) return false; - if (!micropb_encode(stream, static_cast(*arg))) return false; - return true; -} - -// Write out a repeated protobuf field inside a message -template -bool micropb_callback_list(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { - std::vector>* vals = static_cast>*>(*arg); - for (std::unique_ptr& val : *vals) { - auto ptr = static_cast(val.get()); - if (!micropb_callback(stream, field, &ptr)) return false; - } - return true; -} - -bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg); - -// MicroProto helper class -template -struct MicroProto { - // The actual nanopb generated protobuf struct we are filling. - T proto; - - // The constructor takes the protobuf struct by value for initialization - // (since it is a C-style struct). In the constructor you're - // expected to call this with something like onnx_TensorProto_init_default - MicroProto(T proto) : proto(proto) {} - - // Usage: - // std::string owning_slot; - // proto.string_field = string(&owning_slot, value_to_set) - // - // This function takes a string 's' and copies it into the - // owning slot specified by 'slot'. It then returns a callback - // intended to be assigned into the particular protobuf field. - // The employed callback reads out the string from owning - // slot and writes it out to the protobuf. - // - // You should call this function IN THE SETTER METHOD, because - // the no-op callback is different from a callback with an empty - // string: in the former case, the field is absent; in the latter, - // the field is present but an empty string. - pb_callback_t string(std::string* slot, const std::string& s) { - *slot = s; // copy construct - pb_callback_t r; - r.funcs.encode = µpb_callback; - r.arg = static_cast(slot); - return r; // RVO - } - - // Usage: - // at::Tensor owning_slot; - // proto.string_field = string_from_tensor(&owning_slot, value_to_set) - // - // This function takes an at::Tensor and copies it into the - // owning slot specified by 'slot'. It then returns a callback - // intended to be assigned into the particular protobuf field. - // The employed callback reads out the tensor's data as if it - // were a string (adjusting for endianness, if necessary) - // writes it out to the protobuf. - // - // You should call this function IN THE SETTER METHOD, because - // the no-op callback is different from a callback with an undefined - // Tensor. 
- pb_callback_t string_from_tensor(at::Tensor* slot, const at::Tensor& t) { - *slot = t; // copy construct - pb_callback_t r; - r.funcs.encode = µpb_callback_string_from_tensor; - r.arg = static_cast(slot); - return r; // RVO - } - - // Usage: - // unique_vector owning_slot; - // proto.list_field = list(&owning_slot) - // - // This function returns a callback intended to be - // assigned into a particular protobuf field. The employed - // callback reads out the vector of elements from the owning - // slot and writes the entries into the protobuf. - // - // You should call this function IN THE CONSTRUCTOR, because - // the no-op callback is equivalent to a callback with an empty - // list. (While it's harmless to call this in the setter, but - // a bit wasteful.) - template - pb_callback_t list(unique_vector* slot) { - pb_callback_t r; - r.funcs.encode = µpb_callback_list; - r.arg = static_cast(slot); - return r; // RVO - } - - template - pb_callback_t msg(std::unique_ptr* slot) { - *slot = std::unique_ptr(new S()); // default construct - pb_callback_t r; - r.funcs.encode = µpb_callback; - r.arg = static_cast(slot->get()); - return r; // RVO - } -}; - -#define DEFINE_CONST(C) \ -const auto k##C = onnx_TensorProto_DataType_##C; -DEFINE_CONST(FLOAT) -DEFINE_CONST(UINT8) -DEFINE_CONST(INT8) -DEFINE_CONST(UINT16) -DEFINE_CONST(INT16) -DEFINE_CONST(INT32) -DEFINE_CONST(INT64) -DEFINE_CONST(STRING) -DEFINE_CONST(BOOL) -DEFINE_CONST(FLOAT16) -DEFINE_CONST(DOUBLE) -DEFINE_CONST(UINT32) -DEFINE_CONST(UINT64) -DEFINE_CONST(COMPLEX64) -DEFINE_CONST(COMPLEX128) -#undef DEFINE_CONST - -#define DEFINE_CONST(C) \ -const auto a##C = onnx_AttributeProto_AttributeType_##C; -DEFINE_CONST(FLOAT) -DEFINE_CONST(INT) -DEFINE_CONST(STRING) -DEFINE_CONST(TENSOR) -DEFINE_CONST(GRAPH) -DEFINE_CONST(FLOATS) -DEFINE_CONST(INTS) -DEFINE_CONST(STRINGS) -DEFINE_CONST(TENSORS) -DEFINE_CONST(GRAPHS) -#undef DEFINE_CONST - -// C++ wrappers which simulate the Google C++ Protobuf API -// -// These are NOT COMPLETE wrappers. If you find something is missing, add it! - -class AttributeProto; -class TensorShapeProto; -class TypeProtoTensor; -class TensorProto; -class TypeProto; -class ValueInfoProto; -class NodeProto; -class GraphProto; -class ModelProto; - -class TensorProto : public MicroProto { -private: - std::string name; // namespace ValueInfoProto. - unique_vector dims; - at::Tensor raw_data; - std::string dump_; -public: - TensorProto() : MicroProto(onnx_TensorProto_init_default) { - proto.dims = list(&dims); - } - void set_name(const std::string& s) { proto.name = string(&name, s); } - void add_dims(int64_t d) { dims.emplace_back(new int64_t(d)); } - // Google Protobuf divergence! 
- void set_raw_data(const at::Tensor& t) { proto.raw_data = string_from_tensor(&raw_data, t); } - void set_external_data_present() { proto.raw_data = string(&dump_, "__EXTERNAL"); } - void set_data_type(onnx_TensorProto_DataType t) { proto.has_data_type = true; proto.data_type = t; } - std::string get_name() const { return name; } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class TensorShapeProto : public MicroProto { -private: - unique_vector dims; -public: - TensorShapeProto() : MicroProto(onnx_TensorShapeProto_init_default) { - proto.dim = list(&dims); - } - void add_dim(std::int64_t d) { - Dimension* p_d = new Dimension(); - p_d->has_dim_value = true; - p_d->dim_value = d; - dims.emplace_back(p_d); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class TypeProtoTensor : public MicroProto { -private: - std::unique_ptr shape; -public: - TypeProtoTensor() : MicroProto(onnx_TypeProto_Tensor_init_default) {} - void set_data_type(onnx_TensorProto_DataType t) { proto.has_elem_type = true; proto.elem_type = t; } - TensorShapeProto* mutable_shape() { - proto.shape = msg(&shape); - return shape.get(); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class TypeProto : public MicroProto { -private: - std::unique_ptr tensor_type; -public: - TypeProto() : MicroProto(onnx_TypeProto_init_default) {} - TypeProtoTensor* mutable_tensor_type() { - proto.tensor_type = msg(&tensor_type); - return tensor_type.get(); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class ValueInfoProto : public MicroProto { -private: - std::string name; - std::unique_ptr type; -public: - ValueInfoProto() : MicroProto(onnx_ValueInfoProto_init_default) {} - std::string get_name() { return name; } - void set_name(const std::string& s) { proto.name = string(&name, s); } - TypeProto* mutable_type() { - proto.type = msg(&type); - return type.get(); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class AttributeProto : public MicroProto { -private: - std::string name; - std::string s; - std::unique_ptr g; - std::unique_ptr t; - unique_vector floats; - unique_vector ints; - unique_vector strings; - unique_vector tensors; - unique_vector graphs; -public: - AttributeProto() : MicroProto(onnx_AttributeProto_init_default) { - proto.floats = list(&floats); - proto.ints = list(&ints); - proto.strings = list(&strings); - proto.tensors = list(&tensors); - proto.graphs = list(&graphs); - } - void set_name(const std::string& s) { proto.name = string(&name, s); } - void set_type(onnx_AttributeProto_AttributeType t) { proto.has_type = true; proto.type = t; } - void set_f(float f) { proto.has_f = true; proto.f = f; } - void set_i(int64_t i) { proto.has_i = true; proto.i = i; } - void set_s(std::string s_) { proto.s = string(&s, s_); } - // See https://developers.google.com/protocol-buffers/docs/reference/cpp-generated#embeddedmessage - GraphProto* mutable_g() { proto.g = msg(&g); return g.get(); } - TensorProto* mutable_t() { proto.t = msg(&t); return t.get(); } - void add_floats(float f) { floats.emplace_back(new float(f)); } - void add_ints(int64_t i) { ints.emplace_back(new int64_t(i)); } - void add_strings(std::string s) { strings.emplace_back(new std::string(s)); } - TensorProto* add_tensors() { - auto ptr = new TensorProto(); - tensors.emplace_back(ptr); - return ptr; - } - GraphProto* add_graphs(); - void dump(std::ostream& stream, size_t indent = 0); -}; - -class NodeProto : public MicroProto { -private: - std::string op_type; - std::string domain; - 
std::string doc_string; - unique_vector inputs; - unique_vector outputs; - unique_vector attributes; -public: - NodeProto() : MicroProto(onnx_NodeProto_init_default) { - proto.input = list(&inputs); - proto.output = list(&outputs); - proto.attribute = list(&attributes); - } - void add_input(const std::string& s) { inputs.emplace_back(new std::string(s)); } - void clear_input() { inputs.clear(); } - void add_output(const std::string& s) { outputs.emplace_back(new std::string(s)); } - void clear_output() { outputs.clear(); } - AttributeProto* add_attribute() { - auto ptr = new AttributeProto(); - attributes.emplace_back(ptr); - return ptr; - } - void set_op_type(const std::string& s) { proto.op_type = string(&op_type, s); } - void set_domain(const std::string& s) { proto.domain = string(&domain, s); } - void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class GraphProto : public MicroProto { -private: - std::string name; - unique_vector inputs; - unique_vector outputs; - unique_vector nodes; - unique_vector initializers; -public: - GraphProto() : MicroProto(onnx_GraphProto_init_default) { - proto.input = list(&inputs); - proto.output = list(&outputs); - proto.node = list(&nodes); - proto.initializer = list(&initializers); - } - void set_name(const std::string& s) { proto.name = string(&name, s); } - ValueInfoProto* add_input() { - auto ptr = new ValueInfoProto(); - inputs.emplace_back(ptr); - return ptr; - } - std::string get_input_name(size_t i) { return inputs.at(i)->get_name(); } - ValueInfoProto* add_output() { - auto ptr = new ValueInfoProto(); - outputs.emplace_back(ptr); - return ptr; - } - NodeProto* add_node() { - auto ptr = new NodeProto(); - nodes.emplace_back(ptr); - return ptr; - } - TensorProto* add_initializer() { - auto ptr = new TensorProto(); - initializers.emplace_back(ptr); - return ptr; - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class OperatorSetIdProto : public MicroProto { -private: - std::string domain; -public: - OperatorSetIdProto() : MicroProto(onnx_OperatorSetIdProto_init_default) {} - void set_domain(const std::string& s) { proto.domain = string(&domain, s); } - void set_version(int64_t v) { proto.has_version = true; proto.version = v; } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class ModelProto : public MicroProto { -private: - std::string producer_name; - std::string producer_version; - std::string domain; - std::string doc_string; - std::unique_ptr graph; - unique_vector opset_import; -public: - ModelProto() : MicroProto(onnx_ModelProto_init_default) { - proto.has_ir_version = true; - proto.ir_version = onnx_Version_IR_VERSION; - proto.opset_import = list(&opset_import); - } - void set_model_version(int64_t i) { proto.has_model_version = true; proto.model_version = i; } - void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } - void set_producer_name(const std::string& s) { proto.producer_name = string(&producer_name, s); } - void set_producer_version(const std::string& s) { proto.producer_version = string(&producer_version, s); } - GraphProto* mutable_graph() { - proto.graph = msg(&graph); - return graph.get(); - } - OperatorSetIdProto* add_opset_import() { - auto ptr = new OperatorSetIdProto(); - opset_import.emplace_back(ptr); - return ptr; - } - TORCH_API void dump(std::ostream& stream, size_t indent = 0); - std::string prettyPrint() { - std::stringstream ss; - dump(ss, 0); - return 
ss.str(); - } -}; - enum class OperatorExportTypes { ONNX, // Strict ONNX export ONNX_ATEN, // ONNX With ATen op everywhere ONNX_ATEN_FALLBACK, // ONNX export with ATen fallback RAW, // Raw export (no ONNX) }; - }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.npb.cpp b/torch/csrc/onnx/onnx.npb.cpp deleted file mode 100644 index 2d8ee60eaff414..00000000000000 --- a/torch/csrc/onnx/onnx.npb.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* Automatically generated nanopb constant definitions */ -/* Generated by nanopb-0.3.9-dev */ - -#include "onnx.npb.h" - -/* @@protoc_insertion_point(includes) */ -#if PB_PROTO_HEADER_VERSION != 30 -#error Regenerate this file with the current version of nanopb generator. -#endif - - - -const pb_field_t onnx_AttributeProto_fields[14] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_AttributeProto, name, name, 0), - PB_FIELD( 2, FLOAT , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, f, name, 0), - PB_FIELD( 3, INT64 , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, i, f, 0), - PB_FIELD( 4, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, s, i, 0), - PB_FIELD( 5, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, t, s, &onnx_TensorProto_fields), - PB_FIELD( 6, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, g, t, &onnx_GraphProto_fields), - PB_FIELD( 7, FLOAT , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, floats, g, 0), - PB_FIELD( 8, INT64 , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, ints, floats, 0), - PB_FIELD( 9, BYTES , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, strings, ints, 0), - PB_FIELD( 10, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, tensors, strings, &onnx_TensorProto_fields), - PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, graphs, tensors, &onnx_GraphProto_fields), - PB_FIELD( 13, STRING , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, doc_string, graphs, 0), - PB_FIELD( 20, UENUM , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, type, doc_string, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_ValueInfoProto_fields[4] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_ValueInfoProto, name, name, 0), - PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, type, name, &onnx_TypeProto_fields), - PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, doc_string, type, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_NodeProto_fields[8] = { - PB_FIELD( 1, STRING , REPEATED, CALLBACK, FIRST, onnx_NodeProto, input, input, 0), - PB_FIELD( 2, STRING , REPEATED, CALLBACK, OTHER, onnx_NodeProto, output, input, 0), - PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, name, output, 0), - PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, op_type, name, 0), - PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_NodeProto, attribute, op_type, &onnx_AttributeProto_fields), - PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, doc_string, attribute, 0), - PB_FIELD( 7, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, domain, doc_string, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_ModelProto_fields[10] = { - PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_ModelProto, ir_version, ir_version, 0), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_name, ir_version, 0), - PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_version, producer_name, 0), - PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, domain, 
producer_version, 0), - PB_FIELD( 5, INT64 , OPTIONAL, STATIC , OTHER, onnx_ModelProto, model_version, domain, 0), - PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, doc_string, model_version, 0), - PB_FIELD( 7, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, graph, doc_string, &onnx_GraphProto_fields), - PB_FIELD( 8, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, opset_import, graph, &onnx_OperatorSetIdProto_fields), - PB_FIELD( 14, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, metadata_props, opset_import, &onnx_StringStringEntryProto_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_StringStringEntryProto_fields[3] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_StringStringEntryProto, key, key, 0), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_StringStringEntryProto, value, key, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_GraphProto_fields[8] = { - PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_GraphProto, node, node, &onnx_NodeProto_fields), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, name, node, 0), - PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, initializer, name, &onnx_TensorProto_fields), - PB_FIELD( 10, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, doc_string, initializer, 0), - PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, input, doc_string, &onnx_ValueInfoProto_fields), - PB_FIELD( 12, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, output, input, &onnx_ValueInfoProto_fields), - PB_FIELD( 13, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, value_info, output, &onnx_ValueInfoProto_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorProto_fields[13] = { - PB_FIELD( 1, INT64 , REPEATED, CALLBACK, FIRST, onnx_TensorProto, dims, dims, 0), - PB_FIELD( 2, UENUM , OPTIONAL, STATIC , OTHER, onnx_TensorProto, data_type, dims, 0), - PB_FIELD( 3, MESSAGE , OPTIONAL, STATIC , OTHER, onnx_TensorProto, segment, data_type, &onnx_TensorProto_Segment_fields), - PB_FIELD( 4, FLOAT , REPEATED, CALLBACK, OTHER, onnx_TensorProto, float_data, segment, 0), - PB_FIELD( 5, INT32 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int32_data, float_data, 0), - PB_FIELD( 6, BYTES , REPEATED, CALLBACK, OTHER, onnx_TensorProto, string_data, int32_data, 0), - PB_FIELD( 7, INT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int64_data, string_data, 0), - PB_FIELD( 8, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, name, int64_data, 0), - PB_FIELD( 9, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, raw_data, name, 0), - PB_FIELD( 10, DOUBLE , REPEATED, CALLBACK, OTHER, onnx_TensorProto, double_data, raw_data, 0), - PB_FIELD( 11, UINT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, uint64_data, double_data, 0), - PB_FIELD( 12, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, doc_string, uint64_data, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorProto_Segment_fields[3] = { - PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorProto_Segment, begin, begin, 0), - PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_TensorProto_Segment, end, begin, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorShapeProto_fields[2] = { - PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_TensorShapeProto, dim, dim, &onnx_TensorShapeProto_Dimension_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorShapeProto_Dimension_fields[3] = { - PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorShapeProto_Dimension, 
dim_value, dim_value, 0), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorShapeProto_Dimension, dim_param, dim_value, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TypeProto_fields[2] = { - PB_FIELD( 1, MESSAGE , OPTIONAL, CALLBACK, FIRST, onnx_TypeProto, tensor_type, tensor_type, &onnx_TypeProto_Tensor_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TypeProto_Tensor_fields[3] = { - PB_FIELD( 1, UENUM , OPTIONAL, STATIC , FIRST, onnx_TypeProto_Tensor, elem_type, elem_type, 0), - PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_TypeProto_Tensor, shape, elem_type, &onnx_TensorShapeProto_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_OperatorSetIdProto_fields[3] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_OperatorSetIdProto, domain, domain, 0), - PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_OperatorSetIdProto, version, domain, 0), - PB_LAST_FIELD -}; - - - - - -/* Check that field information fits in pb_field_t */ -#if !defined(PB_FIELD_32BIT) -/* If you get an error here, it means that you need to define PB_FIELD_32BIT - * compile-time option. You can do that in pb.h or on compiler command line. - * - * The reason you need to do this is that some of your messages contain tag - * numbers or field sizes that are larger than what can fit in 8 or 16 bit - * field descriptors. - */ -PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 65536), YOU_MUST_DEFINE_PB_FIELD_32BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) -#endif - -#if !defined(PB_FIELD_16BIT) && !defined(PB_FIELD_32BIT) -/* If you get an error here, it means that you need to define PB_FIELD_16BIT - * compile-time option. You can do that in pb.h or on compiler command line. - * - * The reason you need to do this is that some of your messages contain tag - * numbers or field sizes that are larger than what can fit in the default - * 8 bit descriptors. - */ -PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 256), YOU_MUST_DEFINE_PB_FIELD_16BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) -#endif - - -/* On some platforms (such as AVR), double is really float. - * These are not directly supported by nanopb, but see example_avr_double. - * To get rid of this error, remove any double fields from your .proto. - */ -PB_STATIC_ASSERT(sizeof(double) == 8, DOUBLE_MUST_BE_8_BYTES) - -/* @@protoc_insertion_point(eof) */ diff --git a/torch/csrc/onnx/onnx.npb.h b/torch/csrc/onnx/onnx.npb.h deleted file mode 100644 index 84d3b318643830..00000000000000 --- a/torch/csrc/onnx/onnx.npb.h +++ /dev/null @@ -1,333 +0,0 @@ -/* Automatically generated nanopb header */ -/* Generated by nanopb-0.3.9-dev */ - -#ifndef PB_ONNX_ONNX_PB_H_INCLUDED -#define PB_ONNX_ONNX_PB_H_INCLUDED -#include - -/* @@protoc_insertion_point(includes) */ -#if PB_PROTO_HEADER_VERSION != 30 -#error Regenerate this file with the current version of nanopb generator. 
-#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* Enum definitions */ -typedef enum _onnx_Version { - onnx_Version__START_VERSION = 0, - onnx_Version_IR_VERSION_2017_10_10 = 1, - onnx_Version_IR_VERSION_2017_10_30 = 2, - onnx_Version_IR_VERSION = 3 -} onnx_Version; -#define _onnx_Version_MIN onnx_Version__START_VERSION -#define _onnx_Version_MAX onnx_Version_IR_VERSION -#define _onnx_Version_ARRAYSIZE ((onnx_Version)(onnx_Version_IR_VERSION+1)) - -typedef enum _onnx_AttributeProto_AttributeType { - onnx_AttributeProto_AttributeType_UNDEFINED = 0, - onnx_AttributeProto_AttributeType_FLOAT = 1, - onnx_AttributeProto_AttributeType_INT = 2, - onnx_AttributeProto_AttributeType_STRING = 3, - onnx_AttributeProto_AttributeType_TENSOR = 4, - onnx_AttributeProto_AttributeType_GRAPH = 5, - onnx_AttributeProto_AttributeType_FLOATS = 6, - onnx_AttributeProto_AttributeType_INTS = 7, - onnx_AttributeProto_AttributeType_STRINGS = 8, - onnx_AttributeProto_AttributeType_TENSORS = 9, - onnx_AttributeProto_AttributeType_GRAPHS = 10 -} onnx_AttributeProto_AttributeType; -#define _onnx_AttributeProto_AttributeType_MIN onnx_AttributeProto_AttributeType_UNDEFINED -#define _onnx_AttributeProto_AttributeType_MAX onnx_AttributeProto_AttributeType_GRAPHS -#define _onnx_AttributeProto_AttributeType_ARRAYSIZE ((onnx_AttributeProto_AttributeType)(onnx_AttributeProto_AttributeType_GRAPHS+1)) - -typedef enum _onnx_TensorProto_DataType { - onnx_TensorProto_DataType_UNDEFINED = 0, - onnx_TensorProto_DataType_FLOAT = 1, - onnx_TensorProto_DataType_UINT8 = 2, - onnx_TensorProto_DataType_INT8 = 3, - onnx_TensorProto_DataType_UINT16 = 4, - onnx_TensorProto_DataType_INT16 = 5, - onnx_TensorProto_DataType_INT32 = 6, - onnx_TensorProto_DataType_INT64 = 7, - onnx_TensorProto_DataType_STRING = 8, - onnx_TensorProto_DataType_BOOL = 9, - onnx_TensorProto_DataType_FLOAT16 = 10, - onnx_TensorProto_DataType_DOUBLE = 11, - onnx_TensorProto_DataType_UINT32 = 12, - onnx_TensorProto_DataType_UINT64 = 13, - onnx_TensorProto_DataType_COMPLEX64 = 14, - onnx_TensorProto_DataType_COMPLEX128 = 15 -} onnx_TensorProto_DataType; -#define _onnx_TensorProto_DataType_MIN onnx_TensorProto_DataType_UNDEFINED -#define _onnx_TensorProto_DataType_MAX onnx_TensorProto_DataType_COMPLEX128 -#define _onnx_TensorProto_DataType_ARRAYSIZE ((onnx_TensorProto_DataType)(onnx_TensorProto_DataType_COMPLEX128+1)) - -/* Struct definitions */ -typedef struct _onnx_GraphProto { - pb_callback_t node; - pb_callback_t name; - pb_callback_t initializer; - pb_callback_t doc_string; - pb_callback_t input; - pb_callback_t output; - pb_callback_t value_info; -/* @@protoc_insertion_point(struct:onnx_GraphProto) */ -} onnx_GraphProto; - -typedef struct _onnx_NodeProto { - pb_callback_t input; - pb_callback_t output; - pb_callback_t name; - pb_callback_t op_type; - pb_callback_t attribute; - pb_callback_t doc_string; - pb_callback_t domain; -/* @@protoc_insertion_point(struct:onnx_NodeProto) */ -} onnx_NodeProto; - -typedef struct _onnx_StringStringEntryProto { - pb_callback_t key; - pb_callback_t value; -/* @@protoc_insertion_point(struct:onnx_StringStringEntryProto) */ -} onnx_StringStringEntryProto; - -typedef struct _onnx_TensorShapeProto { - pb_callback_t dim; -/* @@protoc_insertion_point(struct:onnx_TensorShapeProto) */ -} onnx_TensorShapeProto; - -typedef struct _onnx_TypeProto { - pb_callback_t tensor_type; -/* @@protoc_insertion_point(struct:onnx_TypeProto) */ -} onnx_TypeProto; - -typedef struct _onnx_ValueInfoProto { - pb_callback_t name; - pb_callback_t type; - 
pb_callback_t doc_string; -/* @@protoc_insertion_point(struct:onnx_ValueInfoProto) */ -} onnx_ValueInfoProto; - -typedef struct _onnx_AttributeProto { - pb_callback_t name; - bool has_f; - float f; - bool has_i; - int64_t i; - pb_callback_t s; - pb_callback_t t; - pb_callback_t g; - pb_callback_t floats; - pb_callback_t ints; - pb_callback_t strings; - pb_callback_t tensors; - pb_callback_t graphs; - pb_callback_t doc_string; - bool has_type; - onnx_AttributeProto_AttributeType type; -/* @@protoc_insertion_point(struct:onnx_AttributeProto) */ -} onnx_AttributeProto; - -typedef struct _onnx_ModelProto { - bool has_ir_version; - int64_t ir_version; - pb_callback_t producer_name; - pb_callback_t producer_version; - pb_callback_t domain; - bool has_model_version; - int64_t model_version; - pb_callback_t doc_string; - pb_callback_t graph; - pb_callback_t opset_import; - pb_callback_t metadata_props; -/* @@protoc_insertion_point(struct:onnx_ModelProto) */ -} onnx_ModelProto; - -typedef struct _onnx_OperatorSetIdProto { - pb_callback_t domain; - bool has_version; - int64_t version; -/* @@protoc_insertion_point(struct:onnx_OperatorSetIdProto) */ -} onnx_OperatorSetIdProto; - -typedef struct _onnx_TensorProto_Segment { - bool has_begin; - int64_t begin; - bool has_end; - int64_t end; -/* @@protoc_insertion_point(struct:onnx_TensorProto_Segment) */ -} onnx_TensorProto_Segment; - -typedef struct _onnx_TensorShapeProto_Dimension { - bool has_dim_value; - int64_t dim_value; - pb_callback_t dim_param; -/* @@protoc_insertion_point(struct:onnx_TensorShapeProto_Dimension) */ -} onnx_TensorShapeProto_Dimension; - -typedef struct _onnx_TypeProto_Tensor { - bool has_elem_type; - onnx_TensorProto_DataType elem_type; - pb_callback_t shape; -/* @@protoc_insertion_point(struct:onnx_TypeProto_Tensor) */ -} onnx_TypeProto_Tensor; - -typedef struct _onnx_TensorProto { - pb_callback_t dims; - bool has_data_type; - onnx_TensorProto_DataType data_type; - bool has_segment; - onnx_TensorProto_Segment segment; - pb_callback_t float_data; - pb_callback_t int32_data; - pb_callback_t string_data; - pb_callback_t int64_data; - pb_callback_t name; - pb_callback_t raw_data; - pb_callback_t double_data; - pb_callback_t uint64_data; - pb_callback_t doc_string; -/* @@protoc_insertion_point(struct:onnx_TensorProto) */ -} onnx_TensorProto; - -/* Default values for struct fields */ - -/* Initializer values for message structs */ -#define onnx_AttributeProto_init_default {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} -#define onnx_ValueInfoProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_NodeProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_ModelProto_init_default {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_StringStringEntryProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_GraphProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_init_default {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_default, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, 
NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_Segment_init_default {false, 0, false, 0} -#define onnx_TensorShapeProto_init_default {{{NULL}, NULL}} -#define onnx_TensorShapeProto_Dimension_init_default {false, 0, {{NULL}, NULL}} -#define onnx_TypeProto_init_default {{{NULL}, NULL}} -#define onnx_TypeProto_Tensor_init_default {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} -#define onnx_OperatorSetIdProto_init_default {{{NULL}, NULL}, false, 0} -#define onnx_AttributeProto_init_zero {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} -#define onnx_ValueInfoProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_NodeProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_ModelProto_init_zero {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_StringStringEntryProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_GraphProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_init_zero {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_zero, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_Segment_init_zero {false, 0, false, 0} -#define onnx_TensorShapeProto_init_zero {{{NULL}, NULL}} -#define onnx_TensorShapeProto_Dimension_init_zero {false, 0, {{NULL}, NULL}} -#define onnx_TypeProto_init_zero {{{NULL}, NULL}} -#define onnx_TypeProto_Tensor_init_zero {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} -#define onnx_OperatorSetIdProto_init_zero {{{NULL}, NULL}, false, 0} - -/* Field tags (for use in manual encoding/decoding) */ -#define onnx_GraphProto_node_tag 1 -#define onnx_GraphProto_name_tag 2 -#define onnx_GraphProto_initializer_tag 5 -#define onnx_GraphProto_doc_string_tag 10 -#define onnx_GraphProto_input_tag 11 -#define onnx_GraphProto_output_tag 12 -#define onnx_GraphProto_value_info_tag 13 -#define onnx_NodeProto_input_tag 1 -#define onnx_NodeProto_output_tag 2 -#define onnx_NodeProto_name_tag 3 -#define onnx_NodeProto_op_type_tag 4 -#define onnx_NodeProto_domain_tag 7 -#define onnx_NodeProto_attribute_tag 5 -#define onnx_NodeProto_doc_string_tag 6 -#define onnx_StringStringEntryProto_key_tag 1 -#define onnx_StringStringEntryProto_value_tag 2 -#define onnx_TensorShapeProto_dim_tag 1 -#define onnx_TypeProto_tensor_type_tag 1 -#define onnx_ValueInfoProto_name_tag 1 -#define onnx_ValueInfoProto_type_tag 2 -#define onnx_ValueInfoProto_doc_string_tag 3 -#define onnx_AttributeProto_name_tag 1 -#define onnx_AttributeProto_doc_string_tag 13 -#define onnx_AttributeProto_type_tag 20 -#define onnx_AttributeProto_f_tag 2 -#define onnx_AttributeProto_i_tag 3 -#define onnx_AttributeProto_s_tag 4 -#define onnx_AttributeProto_t_tag 5 -#define onnx_AttributeProto_g_tag 6 -#define onnx_AttributeProto_floats_tag 7 -#define onnx_AttributeProto_ints_tag 8 -#define onnx_AttributeProto_strings_tag 9 -#define onnx_AttributeProto_tensors_tag 10 -#define onnx_AttributeProto_graphs_tag 11 -#define 
onnx_ModelProto_ir_version_tag 1 -#define onnx_ModelProto_opset_import_tag 8 -#define onnx_ModelProto_producer_name_tag 2 -#define onnx_ModelProto_producer_version_tag 3 -#define onnx_ModelProto_domain_tag 4 -#define onnx_ModelProto_model_version_tag 5 -#define onnx_ModelProto_doc_string_tag 6 -#define onnx_ModelProto_graph_tag 7 -#define onnx_ModelProto_metadata_props_tag 14 -#define onnx_OperatorSetIdProto_domain_tag 1 -#define onnx_OperatorSetIdProto_version_tag 2 -#define onnx_TensorProto_Segment_begin_tag 1 -#define onnx_TensorProto_Segment_end_tag 2 -#define onnx_TensorShapeProto_Dimension_dim_value_tag 1 -#define onnx_TensorShapeProto_Dimension_dim_param_tag 2 -#define onnx_TypeProto_Tensor_elem_type_tag 1 -#define onnx_TypeProto_Tensor_shape_tag 2 -#define onnx_TensorProto_dims_tag 1 -#define onnx_TensorProto_data_type_tag 2 -#define onnx_TensorProto_segment_tag 3 -#define onnx_TensorProto_float_data_tag 4 -#define onnx_TensorProto_int32_data_tag 5 -#define onnx_TensorProto_string_data_tag 6 -#define onnx_TensorProto_int64_data_tag 7 -#define onnx_TensorProto_name_tag 8 -#define onnx_TensorProto_doc_string_tag 12 -#define onnx_TensorProto_raw_data_tag 9 -#define onnx_TensorProto_double_data_tag 10 -#define onnx_TensorProto_uint64_data_tag 11 - -/* Struct field encoding specification for nanopb */ -extern const pb_field_t onnx_AttributeProto_fields[14]; -extern const pb_field_t onnx_ValueInfoProto_fields[4]; -extern const pb_field_t onnx_NodeProto_fields[8]; -extern const pb_field_t onnx_ModelProto_fields[10]; -extern const pb_field_t onnx_StringStringEntryProto_fields[3]; -extern const pb_field_t onnx_GraphProto_fields[8]; -extern const pb_field_t onnx_TensorProto_fields[13]; -extern const pb_field_t onnx_TensorProto_Segment_fields[3]; -extern const pb_field_t onnx_TensorShapeProto_fields[2]; -extern const pb_field_t onnx_TensorShapeProto_Dimension_fields[3]; -extern const pb_field_t onnx_TypeProto_fields[2]; -extern const pb_field_t onnx_TypeProto_Tensor_fields[3]; -extern const pb_field_t onnx_OperatorSetIdProto_fields[3]; - -/* Maximum encoded size of messages (where known) */ -/* onnx_AttributeProto_size depends on runtime parameters */ -/* onnx_ValueInfoProto_size depends on runtime parameters */ -/* onnx_NodeProto_size depends on runtime parameters */ -/* onnx_ModelProto_size depends on runtime parameters */ -/* onnx_StringStringEntryProto_size depends on runtime parameters */ -/* onnx_GraphProto_size depends on runtime parameters */ -/* onnx_TensorProto_size depends on runtime parameters */ -#define onnx_TensorProto_Segment_size 22 -/* onnx_TensorShapeProto_size depends on runtime parameters */ -/* onnx_TensorShapeProto_Dimension_size depends on runtime parameters */ -/* onnx_TypeProto_size depends on runtime parameters */ -/* onnx_TypeProto_Tensor_size depends on runtime parameters */ -/* onnx_OperatorSetIdProto_size depends on runtime parameters */ - -/* Message IDs (where set with "msgid" option) */ -#ifdef PB_MSGID - -#define ONNX_MESSAGES \ - - -#endif - -#ifdef __cplusplus -} /* extern "C" */ -#endif -/* @@protoc_insertion_point(eof) */ - -#endif diff --git a/torch/csrc/onnx/onnx.options b/torch/csrc/onnx/onnx.options deleted file mode 100644 index dd02d208eb7698..00000000000000 --- a/torch/csrc/onnx/onnx.options +++ /dev/null @@ -1,24 +0,0 @@ -# Note [Callback for nested messages] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# nanopb's default translation for a nested, non-repeated (possibly -# optional) message is to include it *inline* (no indirection), with -# a boolean 
has_g/has_t field to indicate its presence or not. Why -# do we not like this? It's not compatible with our ownership model, -# where a TensorProto/GraphProto class owns the protobuf struct it -# is constructing. With the default translation, the protobuf struct -# occurs in two places: a TensorProto, AND the parent protobuf struct -# field. That's bad. Turning it back into a callback solves the -# ownership problem. -# -# Two more bonuses: at the cost of an indirection, we no longer waste fields -# when we aren't actually storing a graph/tensor; furthermore, circular -# dependencies now work! - -onnx.AttributeProto.g type:FT_CALLBACK -onnx.AttributeProto.t type:FT_CALLBACK -onnx.ModelProto.graph type:FT_CALLBACK -onnx.TypeProto.Tensor.shape type:FT_CALLBACK -onnx.TypeProto.tensor_type type:FT_CALLBACK -onnx.ValueInfoProto.type type:FT_CALLBACK -onnx.TypeProto no_unions:true -onnx.TensorShapeProto.Dimension no_unions:true diff --git a/torch/csrc/utils/hash.h b/torch/csrc/utils/hash.h index 05a5a27b51223a..954a7b5b7d0814 100644 --- a/torch/csrc/utils/hash.h +++ b/torch/csrc/utils/hash.h @@ -32,7 +32,7 @@ namespace torch { // DEALINGS IN THE SOFTWARE. inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); + return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index f8d5fd1ba1cd63..0160bdd2d8e506 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -16,7 +16,7 @@ std::string py_typename(PyObject *object) { struct Type { virtual bool is_matching(PyObject *object) = 0; - virtual ~Type() {}; + virtual ~Type() = default; }; struct SimpleType: public Type { diff --git a/torch/csrc/utils/invalid_arguments.h b/torch/csrc/utils/invalid_arguments.h index 138c3331113b7c..daaccfd877f377 100644 --- a/torch/csrc/utils/invalid_arguments.h +++ b/torch/csrc/utils/invalid_arguments.h @@ -7,7 +7,9 @@ namespace torch { std::string format_invalid_args( - PyObject *args, PyObject *kwargs, const std::string& name, + PyObject* given_args, + PyObject* given_kwargs, + const std::string& function_name, const std::vector& options); } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index b00bd27c087495..0f2f51904c2554 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -90,8 +90,8 @@ struct PythonArgParser { private: [[noreturn]] - void print_error(PyObject* args, PyObject* kwargs, PyObject* dst[]); - PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* dst[]); + void print_error(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); + PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); std::vector signatures_; std::string function_name; diff --git a/torch/csrc/utils/tensor_apply.h b/torch/csrc/utils/tensor_apply.h index 47fbaa672c4262..5dfdef98c81db4 100644 --- a/torch/csrc/utils/tensor_apply.h +++ b/torch/csrc/utils/tensor_apply.h @@ -6,8 +6,8 @@ namespace torch { namespace utils { at::Tensor & apply_(at::Tensor & self, PyObject* fn); -at::Tensor & map_(at::Tensor & self, const at::Tensor & other, PyObject* fn); -at::Tensor & map2_(at::Tensor & self, const at::Tensor & other1, - const at::Tensor & other2, PyObject* fn); +at::Tensor & map_(at::Tensor & self, const at::Tensor 
& other_, PyObject* fn); +at::Tensor & map2_(at::Tensor & self, const at::Tensor & x_, + const at::Tensor & y_, PyObject* fn); }} // namespace torch::utils diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 3a8b4a7bbc1592..d03fd55f2accfc 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -139,8 +139,10 @@ ScalarType infer_scalar_type(PyObject *obj) { } #ifdef USE_NUMPY if (PyArray_Check(obj)) { - auto array = (PyArrayObject*)obj; - return numpy_dtype_to_aten(PyArray_TYPE(array)); + return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)obj)); + } + if (PyArray_CheckScalar(obj)) { + return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, NULL)))); } #endif if (PySequence_Check(obj)) { diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index f8b26b121fd3e8..a2086ae95b899c 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -61,7 +61,8 @@ def init_process_group(backend, init_method='env://', **kwargs): group_name (str, optional): Group name. See description of init methods. To enable ``backend == mpi``, PyTorch needs to be built from source on a system that - supports MPI. + supports MPI. If you want to use Open MPI with CUDA-aware support, please use Open MPI + major version 2 and above. """ world_size = kwargs.pop('world_size', -1) diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index ca961d88ba0a63..47ee177c2cc959 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -96,6 +96,7 @@ from .lowrank_multivariate_normal import LowRankMultivariateNormal from .multinomial import Multinomial from .multivariate_normal import MultivariateNormal +from .negative_binomial import NegativeBinomial from .normal import Normal from .one_hot_categorical import OneHotCategorical from .pareto import Pareto @@ -129,6 +130,7 @@ 'LogisticNormal', 'Multinomial', 'MultivariateNormal', + 'NegativeBinomial', 'Normal', 'OneHotCategorical', 'Pareto', diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index a263082c967fe1..f8688af3f3a392 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -164,7 +164,9 @@ def _transform_to_positive(constraint): @biject_to.register(constraints.greater_than) +@biject_to.register(constraints.greater_than_eq) @transform_to.register(constraints.greater_than) +@transform_to.register(constraints.greater_than_eq) def _transform_to_greater_than(constraint): return transforms.ComposeTransform([transforms.ExpTransform(), transforms.AffineTransform(constraint.lower_bound, 1)]) @@ -178,7 +180,9 @@ def _transform_to_less_than(constraint): @biject_to.register(constraints.interval) +@biject_to.register(constraints.half_open_interval) @transform_to.register(constraints.interval) +@transform_to.register(constraints.half_open_interval) def _transform_to_interval(constraint): # Handle the special case of the unit interval.
lower_is_0 = isinstance(constraint.lower_bound, numbers.Number) and constraint.lower_bound == 0 diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 18da2bff1392a4..0b6eb53b0cd93a 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -27,8 +27,10 @@ 'dependent', 'dependent_property', 'greater_than', + 'greater_than_eq', 'integer_interval', 'interval', + 'half_open_interval', 'is_dependent', 'less_than', 'lower_cholesky', @@ -151,6 +153,17 @@ def check(self, value): return self.lower_bound < value +class _GreaterThanEq(Constraint): + """ + Constrain to a real half line `[lower_bound, inf)`. + """ + def __init__(self, lower_bound): + self.lower_bound = lower_bound + + def check(self, value): + return self.lower_bound <= value + + class _LessThan(Constraint): """ Constrain to a real half line `[-inf, upper_bound)`. @@ -174,6 +187,18 @@ def check(self, value): return (self.lower_bound <= value) & (value <= self.upper_bound) +class _HalfOpenInterval(Constraint): + """ + Constrain to a real interval `[lower_bound, upper_bound)`. + """ + def __init__(self, lower_bound, upper_bound): + self.lower_bound = lower_bound + self.upper_bound = upper_bound + + def check(self, value): + return (self.lower_bound <= value) & (value < self.upper_bound) + + class _Simplex(Constraint): """ Constrain to the unit simplex in the innermost (rightmost) dimension. @@ -240,9 +265,11 @@ def check(self, value): real_vector = _RealVector() positive = _GreaterThan(0.) greater_than = _GreaterThan +greater_than_eq = _GreaterThanEq less_than = _LessThan unit_interval = _Interval(0., 1.) interval = _Interval +half_open_interval = _HalfOpenInterval simplex = _Simplex() lower_triangular = _LowerTriangular() lower_cholesky = _LowerCholesky() diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py new file mode 100644 index 00000000000000..854ad5b7b087fa --- /dev/null +++ b/torch/distributions/negative_binomial.py @@ -0,0 +1,83 @@ +import torch +import torch.nn.functional as F +from torch.distributions import constraints +from torch.distributions.distribution import Distribution +from torch.distributions.utils import broadcast_all, probs_to_logits, lazy_property, logits_to_probs + + +class NegativeBinomial(Distribution): + r""" + Creates a Negative Binomial distribution, i.e. distribution + of the number of independent identical Bernoulli trials + needed before `total_count` failures are achieved. The probability + of success of each Bernoulli trial is `probs`. 
+ + Args: + total_count (float or Tensor): non-negative number of negative Bernoulli + trials to stop, although the distribution is still valid for real + valued count + probs (Tensor): Event probabilities of success in the half open interval [0, 1) + logits (Tensor): Event log-odds for probabilities of success + """ + arg_constraints = {'total_count': constraints.greater_than_eq(0), + 'probs': constraints.half_open_interval(0., 1.)} + support = constraints.nonnegative_integer + + def __init__(self, total_count, probs=None, logits=None, validate_args=None): + if (probs is None) == (logits is None): + raise ValueError("Either `probs` or `logits` must be specified, but not both.") + if probs is not None: + self.total_count, self.probs, = broadcast_all(total_count, probs) + self.total_count = self.total_count.type_as(self.probs) + else: + self.total_count, self.logits, = broadcast_all(total_count, logits) + self.total_count = self.total_count.type_as(self.logits) + + self._param = self.probs if probs is not None else self.logits + batch_shape = self._param.size() + super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args) + + def _new(self, *args, **kwargs): + return self._param.new(*args, **kwargs) + + @property + def mean(self): + return self.total_count * torch.exp(self.logits) + + @property + def variance(self): + return self.mean / torch.sigmoid(-self.logits) + + @lazy_property + def logits(self): + return probs_to_logits(self.probs, is_binary=True) + + @lazy_property + def probs(self): + return logits_to_probs(self.logits, is_binary=True) + + @property + def param_shape(self): + return self._param.size() + + @lazy_property + def _gamma(self): + return torch.distributions.Gamma(concentration=self.total_count, + rate=torch.exp(-self.logits)) + + def sample(self, sample_shape=torch.Size()): + with torch.no_grad(): + rate = self._gamma.sample(sample_shape=sample_shape) + return torch.poisson(rate) + + def log_prob(self, value): + if self._validate_args: + self._validate_sample(value) + + log_unnormalized_prob = (self.total_count * F.logsigmoid(-self.logits) + + value * F.logsigmoid(self.logits)) + + log_normalization = (-torch.lgamma(self.total_count + value) + torch.lgamma(1. + value) + + torch.lgamma(self.total_count)) + + return log_unnormalized_prob - log_normalization diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index ccc0ffffa2ec21..0219942aac155a 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -32,30 +32,19 @@ def _finfo(tensor): return _FINFO[tensor.storage_type()] -def _broadcast_shape(shapes): - r""" - Given a list of tensor sizes, returns the size of the resulting broadcasted - tensor. - - Args: - shapes (list of torch.Size): list of tensor sizes - """ - shape = torch.Size() - for s in shapes: - shape = torch._C._infer_size(s, shape) - return shape +# promote numbers to tensors of dtype torch.get_default_dtype() +def _default_promotion(v): + return torch.tensor(v, dtype=torch.get_default_dtype()) def broadcast_all(*values): r""" Given a list of values (possibly containing numbers), returns a list where each value is broadcasted based on the following rules: - - `torch.*Tensor` instances are broadcasted as per the `broadcasting rules - `_ + - `torch.*Tensor` instances are broadcasted as per :ref:`_broadcasting-semantics`. - numbers.Number instances (scalars) are upcast to tensors having the same size and type as the first tensor passed to `values`. 
If all the - values are scalars, then they are upcasted to Tensors having size - `(1,)`. + values are scalars, then they are upcasted to scalar Tensors. Args: values (list of `numbers.Number` or `torch.*Tensor`) @@ -64,22 +53,16 @@ def broadcast_all(*values): ValueError: if any of the values is not a `numbers.Number` or `torch.*Tensor` instance """ - values = list(values) - scalar_idxs = [i for i in range(len(values)) if isinstance(values[i], Number)] - tensor_idxs = [i for i in range(len(values)) if values[i].__class__.__name__ == 'Tensor'] - if len(scalar_idxs) + len(tensor_idxs) != len(values): + if not all(torch.is_tensor(v) or isinstance(v, Number) for v in values): raise ValueError('Input arguments must all be instances of numbers.Number or torch.tensor.') - if tensor_idxs: - broadcast_shape = _broadcast_shape([values[i].size() for i in tensor_idxs]) - for idx in tensor_idxs: - values[idx] = values[idx].expand(broadcast_shape) - template = values[tensor_idxs[0]] - for idx in scalar_idxs: - values[idx] = template.new(template.size()).fill_(values[idx]) - else: - for idx in scalar_idxs: - values[idx] = torch.tensor(float(values[idx])) - return values + if not all(map(torch.is_tensor, values)): + new_tensor = _default_promotion + for value in values: + if torch.is_tensor(value): + new_tensor = value.new_tensor + break + values = [v if torch.is_tensor(v) else new_tensor(v) for v in values] + return torch.broadcast_tensors(*values) def _sum_rightmost(value, dim): diff --git a/torch/functional.py b/torch/functional.py index 19d47f394fa757..0133a012981854 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -10,6 +10,7 @@ 'argmin', 'btrifact', 'btriunpack', + 'broadcast_tensors', 'isfinite', 'isinf', 'isnan', @@ -19,6 +20,28 @@ ] +def broadcast_tensors(*tensors): + r"""broadcast_tensors(*tensors) -> List of Tensors + + Broadcasts the given tensors according to :ref:`_broadcasting-semantics`. + + Args: + *tensors: any number of tensors of the same type + + Example:: + + >>> x = torch.arange(3).view(1, 3) + >>> y = torch.arange(2).view(2, 1) + >>> a, b = torch.broadcast_tensors(x, y) + >>> a.size() + torch.Size([2, 3]) + >>> a + tensor([[0, 1, 2], + [0, 1, 2]]) + """ + return torch._C._VariableFunctions.broadcast_tensors(tensors) + + def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. 
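[Editor's note on the two hunks above, before the next file diff.] The rewritten `broadcast_all` now delegates to the new `torch.broadcast_tensors`: plain numbers are first promoted to scalar tensors (taking the dtype of the first tensor argument when one is present, otherwise `torch.get_default_dtype()`), and everything is then broadcast together. A minimal sketch of the resulting behaviour, assuming a PyTorch build that includes this patch:

# Hedged sketch: illustrates the post-patch broadcast_all semantics.
import torch
from torch.distributions.utils import broadcast_all

x = torch.arange(6., dtype=torch.float64).view(2, 3)
a, b = broadcast_all(x, 0.5)     # 0.5 is promoted via x.new_tensor(0.5)
assert a.shape == b.shape == (2, 3)
assert b.dtype == torch.float64  # dtype follows the first tensor argument

c, = broadcast_all(2.0)          # all-scalar input -> 0-dim tensor of the default dtype
assert c.dim() == 0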
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index c0cf4f9d1c2e75..d09e970f729470 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -403,9 +403,12 @@ def wrapper(*args): else: new_args.append(arg) res = res_mod(*new_args) - # assert len(res) / 3 == 0 - # result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] - result = BatchTensor(*res) + assert len(res) % 3 == 0 + if len(res) % 3 != 0: + raise "non-batched-tensor output is not supported yet" + result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] + if len(result) == 1: + return result[0] return result wrapper.__doc__ = fn.__doc__ return wrapper diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 77e6cf777f2784..1db7749e07e34e 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -3,7 +3,7 @@ import ast import inspect import torch -from torch._C import DynamicType, TupleType +from torch._C import DynamicType, TupleType, FloatType, IntType from textwrap import dedent @@ -204,9 +204,13 @@ def as_ann(ann): def ann_to_type(ann): if ann is None: - return DynamicType() + return DynamicType.get() elif ann is torch.Tensor: - return DynamicType() + return DynamicType.get() elif is_tuple(ann): return TupleType([ann_to_type(a) for a in ann.__args__]) + elif ann is float: + return FloatType.get() + elif ann is int: + return IntType.get() raise ValueError("The only supported annotations kinds are Tensor and Tuple[...]") diff --git a/torch/jit/batchop.py b/torch/jit/batchop.py index bda6a3adca3a88..053130dc0fb488 100644 --- a/torch/jit/batchop.py +++ b/torch/jit/batchop.py @@ -1,6 +1,9 @@ import torch +from torch.jit import BatchTensor +# TODO: there are some commented raise statements +# when we support rasie exception in script, we want to check them @torch.jit.script def batch_tanh(data, mask, dims): data = torch.tanh(data) @@ -14,13 +17,52 @@ def batch_sigmoid(data, mask, dims): @torch.jit.script -def batch_add(data1, mask1, dims1, data2, mask2, dims2): - data = torch.add(data1, data2) +def batch_relu(data, mask, dims): + data = torch.relu(data) + return data, mask, dims + + +@torch.jit.script +def batch_neg(data, mask, dims): + data = torch.neg(data) + return data, mask, dims + + +@torch.jit.script +def batch_neg_scalar(data): + return torch.neg(data) + + +@torch.jit.script +def batch_add(data1, mask1, dims1, data2, mask2, dims2, alpha_): + alpha = float(alpha_) + data = torch.add(data1, data2, alpha) + mask = mask1 * mask2 + dims = dims1 or dims2 + return data, mask, dims + + +@torch.jit.script +def batch_add_scalar(data, mask, dims, other, alpha_): + alpha = float(alpha_) + data = torch.add(data, other.type_as(data), alpha) + return data, mask, dims + + +@torch.jit.script +def batch_sub(data1, mask1, dims1, data2, mask2, dims2, alpha_): + alpha = float(alpha_) + data = torch.sub(data1, data2, alpha) mask = mask1 * mask2 dims = dims1 or dims2 return data, mask, dims +@torch.jit.script +def batch_sub_scalar(data1, data2): + return data1 - data2 + + @torch.jit.script def batch_mul(data1, mask1, dims1, data2, mask2, dims2): data = torch.mul(data1, data2) @@ -29,6 +71,17 @@ def batch_mul(data1, mask1, dims1, data2, mask2, dims2): return data, mask, dims +@torch.jit.script +def batch_mul_scalar(data1, data2): + return data1 * data2 + + +@torch.jit.script +def batch_div(data, mask, dims, other): # div(batchtensor, scalar) + data = torch.div(data, other) + return data, mask, dims + + @torch.jit.script def batch_mm(data1, mask1, dims1, data2, 
mask2, dims2): data1 = data1 * mask1.type_as(data1) @@ -88,26 +141,388 @@ def batch_select(data, mask, dims, dim_, index_): # raise ValueError("Cannot select 0 dim in BatchTensor") data = data.select(dim, index) if dims[dim - 1]: - mask = mask.select(dim, 0) - else: mask = mask.select(dim, index) + else: + mask = mask.select(dim, 0) dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) return data, mask, dims +@torch.jit.script +def batch_fmod(data, mask, dims, other_): + other = int(other_) + data = torch.fmod(data, other) + return data, mask, dims + + +@torch.jit.script +def batch_zeros_like(data, mask, dims): + res_data = torch.zeros_like(data) + return res_data, mask, dims + + +@torch.jit.script +def batch_index_select(data, mask, dims, dim_, index_data, index_mask, index_dims): + dim = int(dim_) + # if dim == 0: + # raise ValueError("Cannot index_select along 0 dim in BatchTensor") + batch_size = data.size(0) # TODO maybe index_mask will be used at some point + res_data = torch.zeros([0]) + res_mask = torch.zeros([0]) + for i in range(batch_size): + d = data[i].index_select(dim - 1, index_data[i]).unsqueeze(0) + if dims[dim - 1]: + m = mask[i].index_select(dim - 1, index_data[i]).unsqueeze(0) + else: + m = mask[i].unsqueeze(0) + if i == 0: + res_data = d + res_mask = m + else: + res_data = torch.cat((res_data, d), 0) + res_mask = torch.cat((res_mask, m), 0) + return res_data, res_mask, dims + + +@torch.jit.script +def batch_view_as(data, mask, dims, data1, mask1, dims1): + # if data.size(0) != data1.size(0): + # raise ValueError("In view_as, tensor and target tensor should have the same batch_size") + # if not torch.equal(dims, dims1): + # raise ValueError("In batched view_as, dims and target dims should be the same") + data = data.view_as(data1) + mask = mask.view_as(mask1) + dims = dims1 + return data, mask, dims + + # assume data, data1, data2 have same size @torch.jit.script def batch_where(data, mask, dims, data1, mask1, dims1, data2, mask2, dims2): - res_data = torch.where(data, data1, data2) - res_mask = torch.where(data, mask1, mask2) + data = data * mask.type_as(data) + cond_data = data + cond_mask = data + if data.dim() == 1: + for _ in range(data1.dim() - 1): + data = data.unsqueeze(data.dim()) + cond_data = data.expand_as(data1) + cond_mask = data.expand_as(mask1) + res_data = torch.where(cond_data, data1, data2) + res_mask = torch.where(cond_mask, mask1, mask2) res_dims = dims1 or dims2 return res_data, res_mask, res_dims + +@torch.jit.script +def batch_where_scalar(cond_, data1, mask1, dims1, data2, mask2, dims2): + cond = torch.zeros([1], dtype=torch.uint8) * cond_ + res_data = torch.where(cond, data1, data2) + res_mask = torch.where(cond, mask1, mask2) + res_dims = torch.where(cond, dims1, dims2) + return res_data, res_mask, res_dims + + +@torch.jit.script +def batch_update(batch_data, batch_mask, batch_dims, new_data, new_mask, new_dims): + data = torch.where(new_mask, new_data, batch_data) + return data, new_mask, new_dims # TODO: consider whether return new_mask and new_dims + + +@torch.jit.script +def batch_any(data, mask, dims): + return torch.gt(torch.sum(data * mask), 0) + + +@torch.jit.script +def batch_type_as(data, mask, dims, data1, mask1, dims1): + return data.type_as(data1), mask, dims + + +@torch.jit.script +def batch_gt(data, mask, dims, data1, mask1, dims1): + return torch.gt(data, data1), mask * mask1, dims or dims1 + + +@torch.jit.script +def batch_gt_scalar(data1, data2): + return torch.gt(data1, data2) + + +@torch.jit.script +def 
batch_gt_one_scalar(data, mask, dims, other_): + other = float(other_) + return torch.gt(data, other), mask, dims + + +@torch.jit.script +def batch_lt(data, mask, dims, data1, mask1, dims1): + return torch.lt(data, data1), mask * mask1, dims or dims1 + + +@torch.jit.script +def batch_eq(data, mask, dims, data1, mask1, dims1): + return torch.eq(data, data1), mask * mask1, dims or dims1 + + +@torch.jit.script +def batch_size(data, mask, dims, dim_): + dim = int(dim_) + return data.size(dim) + + +@torch.jit.script +def batch_dim(data, mask, dims): + return data.dim() + + +@torch.jit.script +def batch_squeeze(data, mask, dims, dim_): + if int(dim_) < 0: + dim_ += data.dim() + dim = int(dim_) + # if dim == 0: + # raise ValueError("cannot do squeeze along batch_dim") + data = data.squeeze(dim) + mask = mask.squeeze(dim) + dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) + return data, mask, dims + + +@torch.jit.script +def batch_unsqueeze(data, mask, dims, dim_): + if int(dim_) < 0: + dim_ += data.dim() + 1 + dim = int(dim_) + # if dim == 0: + # raise ValueError("cannot do unsqueeze along batch_dim") + data = data.unsqueeze(dim) + mask = mask.unsqueeze(dim) + dims = torch.cat((dims[:dim], torch.zeros([1], dtype=torch.uint8), dims[dim:dims.size(0)])) + return data, mask, dims + + +@torch.jit.script +def batch_argmax(data, mask, dims, dim_, keepdim_): + dim = int(dim_) + keepdim = int(keepdim_) + # if dim == 0: + # raise ValueError("cannot do argmax along batch_dim") + batch_size = data.size(0) + res_data = torch.zeros([0]) + for i in range(batch_size): + if dims[dim - 1]: + if dim - 1 != 0: + m = mask[i].transpose(0, dim - 1) + else: + m = mask[i] + valid_num = m.sum(0, keepdim=True) + while(valid_num.dim() >= 1): + valid_num = valid_num[0] + d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) + else: + d = data[i].unsqueeze(0) + d = d.argmax(dim, keepdim) + if i == 0: + res_data = d + else: + res_data = torch.cat([res_data, d], 0) + if keepdim: + mask = mask + else: + mask = mask.select(dim, 0) + dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) + return res_data, mask, dims + + +@torch.jit.script +def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): + k = int(k_) + dim = int(dim_) + largest = int(largest_) + sorted = int(sorted_) + # if dim == 0: + # raise ValueError("cannot do topk along batch_dim") + batch_size = data.size(0) + res_data = torch.zeros([0]) + res_index = torch.zeros([0]) + for i in range(batch_size): + if dims[dim - 1]: + if dim - 1 != 0: + m = mask[i].transpose(0, dim - 1) + else: + m = mask[i] + valid_num = m.sum(0, keepdim=True) + while(valid_num.dim() >= 1): + valid_num = valid_num[0] + d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) + else: + d = data[i].unsqueeze(0) + d, idx = d.topk(k, dim, largest, sorted) + if i == 0: + res_data = d + res_index = idx + else: + res_data = torch.cat([res_data, d], 0) + res_index = torch.cat([res_index, idx], 0) + if dims[dim - 1]: + mask = mask.narrow(dim, 0, k) + return res_data, mask, dims, res_index, mask, dims + + +@torch.jit.script +def batch_softmax(data, mask, dims, dim_): + dim = int(dim_) + # if dim == 0: + # raise ValueError("cannot do softmax along batch_dim") + batch_size = data.size(0) + max_len = data.size(dim) + res_data = torch.zeros([0]) + for i in range(batch_size): + if dims[dim - 1]: + if dim - 1 != 0: + m = mask[i].transpose(0, dim - 1) + else: + m = mask[i] + valid_num = m.sum(0, keepdim=True) + while(valid_num.dim() >= 1): + valid_num = valid_num[0] + valid_num = 
int(valid_num) + d = data[i].unsqueeze(0).narrow(dim, 0, valid_num).softmax(dim) + if valid_num < max_len: + d = torch.cat([d, data[i].unsqueeze(0).narrow(dim, valid_num, max_len - valid_num)], dim) + else: + d = data[i].unsqueeze(0).softmax(dim) + if i == 0: + res_data = d + else: + res_data = torch.cat([res_data, d], 0) + return res_data, mask, dims + + +# size argument in dynamic dimension has to be -1 +# in static dimension, size has to be specified, -1 is not supported +@torch.jit.script +def batch_view(data, mask, dims, sizes): + batch_size = data.size(0) + # if(sizes[0] != batch_size and sizes[0] != -1 and sizes[0] != 1): + # raise "first dim in view must be 1, -1, or batch size" + # for i in range(dims.size(0)): + # if dims[0] == 1 and sizes[i + 1] != -1: + # raise "size argument in dynamic dimension has to be -1" + sizes = sizes.type_as(torch.ones([1], dtype=torch.int)) + data_sizes_ = torch.cat([torch.ones([1], dtype=torch.int) * batch_size, sizes.narrow(0, 1, sizes.size(0) - 1)], 0) + data_sizes = data_sizes_._tensor_to_list() + res_data = data.view(data_sizes) + mask_sizes_ = data_sizes_.narrow(0, 0, 1) + res_dims = data_sizes_.narrow(0, 0, 1) + for i_ in range(sizes.size(0) - 1): + i = i_ + 1 + if(sizes[i] == -1): + cur_size_ = mask.size(i) + cur_dim = 1 + else: + cur_size_ = 1 + cur_dim = 0 + mask_sizes_ = torch.cat([mask_sizes_, torch.ones([1], dtype=torch.int) * cur_size_]) + res_dims = torch.cat([res_dims, torch.ones([1], dtype=torch.int) * cur_dim]) + mask_sizes = mask_sizes_._tensor_to_list() + res_mask = mask.view(mask_sizes) + return res_data, res_mask, res_dims.narrow(0, 1, res_dims.size(0) - 1).type_as(dims) + + +@torch.jit.script +def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): + dim = int(dim_) + data = torch.cat([data1, data2], dim) + if(dims1[dim - 1]): + mask = torch.cat([mask1, mask2], dim) + else: + mask = mask1 + return data, mask, dims1 + + +@torch.jit.script +def batch_cat3(data1, mask1, dims1, data2, mask2, dims2, data3, mask3, dims3, dim_): + dim = int(dim_) + data = torch.cat([data1, data2, data3], dim) + if(dims1[dim - 1]): + mask = torch.cat([mask1, mask2, mask3], dim) + else: + mask = mask1 + return data, mask, dims1 + + +@torch.jit.script +def batch_narrow(data, mask, dims, dimension_, start_, length_): + dimension = int(dimension_) + start = int(start_) + length = int(length_) + # if dimension == 0: + # raise ValueError("cannot do narrow along batch_dim") + data = data.narrow(dimension, start, length) + if dims[dimension - 1]: + mask = mask.narrow(dimension, start, length) + else: + mask = mask.narrow(dimension, 0, 1) + return data, mask, dims + + +@torch.jit.script +def batch_sum(data, mask, dims): + data = data * mask.type_as(data) + for _ in range(dims.size(0)): + data = data.sum(1) + mask = torch.ones([data.size(0)], dtype=torch.uint8) + dims = dims[:0] # empty tensor + return data, mask, dims + + +@torch.jit.script +def batch_from_scalar_tensor(data): + data = data.unsqueeze(0) + mask = torch.ones([1], dtype=torch.uint8) + dims = torch.zeros([0], dtype=torch.uint8) + return data, mask, dims + torch.register_batch_operator("tanh", batch_tanh.graph) torch.register_batch_operator("sigmoid", batch_sigmoid.graph) +torch.register_batch_operator("relu", batch_relu.graph) +torch.register_batch_operator("neg", batch_neg.graph) +torch.register_batch_operator("neg", batch_neg_scalar.graph) torch.register_batch_operator("add", batch_add.graph) +torch.register_batch_operator("add", batch_add_scalar.graph) 
+torch.register_batch_operator("sub", batch_sub.graph) +torch.register_batch_operator("sub", batch_sub_scalar.graph) torch.register_batch_operator("mul", batch_mul.graph) +torch.register_batch_operator("mul", batch_mul_scalar.graph) +torch.register_batch_operator("div", batch_div.graph) torch.register_batch_operator("matmul", batch_matmul.graph) torch.register_batch_operator("mm", batch_mm.graph) +torch.register_batch_operator("fmod", batch_fmod.graph) +torch.register_batch_operator("zeros_like", batch_zeros_like.graph) torch.register_batch_operator("select", batch_select.graph) +torch.register_batch_operator("index_select", batch_index_select.graph) +torch.register_batch_operator("view_as", batch_view_as.graph) torch.register_batch_operator("where", batch_where.graph) +torch.register_batch_operator("where", batch_where_scalar.graph) +torch.register_batch_operator("update", batch_update.graph) +torch.register_batch_operator("any", batch_any.graph) +torch.register_batch_operator("type_as", batch_type_as.graph) +torch.register_batch_operator("gt", batch_gt.graph) +torch.register_batch_operator("gt", batch_gt_scalar.graph) +torch.register_batch_operator("gt", batch_gt_one_scalar.graph) +torch.register_batch_operator("lt", batch_lt.graph) +torch.register_batch_operator("eq", batch_eq.graph) +torch.register_batch_operator("size", batch_size.graph) +torch.register_batch_operator("dim", batch_dim.graph) +torch.register_batch_operator("squeeze", batch_squeeze.graph) +torch.register_batch_operator("unsqueeze", batch_unsqueeze.graph) +torch.register_batch_operator("argmax", batch_argmax.graph) +torch.register_batch_operator("topk", batch_topk.graph) +torch.register_batch_operator("softmax", batch_softmax.graph) +torch.register_batch_operator("view", batch_view.graph) +torch.register_batch_operator("cat", batch_cat2.graph) +torch.register_batch_operator("cat", batch_cat3.graph) +torch.register_batch_operator("narrow", batch_narrow.graph) +torch.register_batch_operator("sum", batch_sum.graph) +torch.register_batch_operator("batch_from_scalar_tensor", batch_from_scalar_tensor.graph) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index d152b2010fcae4..bc979d15141121 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -435,8 +435,8 @@ def build_List(ctx, expr): @staticmethod def build_Tuple(ctx, expr): - return ListLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), - [build_expr(ctx, e) for e in expr.elts]) + return TupleLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), + [build_expr(ctx, e) for e in expr.elts]) @staticmethod def build_Num(ctx, expr): diff --git a/torch/legacy/nn/ELU.py b/torch/legacy/nn/ELU.py index 6ad240658a9e28..9e00e8a172fc88 100644 --- a/torch/legacy/nn/ELU.py +++ b/torch/legacy/nn/ELU.py @@ -23,6 +23,7 @@ def updateOutput(self, input): self.output, self.alpha, 1.0, + 1.0, self.inplace ) return self.output @@ -34,6 +35,7 @@ def updateGradInput(self, input, gradOutput): self.gradInput, self.output, self.alpha, + 1.0, 1.0 ) return self.gradInput diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp index cc176931d8c0c2..b23157581bdfc0 100644 --- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp +++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp @@ -100,6 +100,14 @@ void DataChannelMPI::destroy() {} bool DataChannelMPI::init() { +#ifdef OMPI_MAJOR_VERSION + // OMPI_* is specific to Openmpi implementation. 
+ // Openmpi v1.10 segfaults in MPI_Bcast with CUDA buffer. + if (int(OMPI_MAJOR_VERSION) < 2) { + throw std::runtime_error("Please use Openmpi major version 2 and above for distributed."); + } +#endif /* OMPI_MAJOR_VERSION */ + int provided; MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); if (provided != MPI_THREAD_MULTIPLE) { diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 26f6c480420b67..9bb0ef0e98ca82 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -64,7 +64,7 @@ inline std::vector> getSizes( const std::vector& tensors) { std::vector> sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { - sizes[i] = tensors[i].sizes(); + sizes[i] = tensors[i].sizes().vec(); } return sizes; } diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 17a7c09b012da6..746c2664529175 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -741,6 +741,25 @@ def selu(input, inplace=False): """) +def celu(input, alpha=1., inplace=False): + r"""celu(input, alpha=1., inplace=False) -> Tensor + + Applies element-wise, + :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))`. + + See :class:`~torch.nn.CELU` for more details. + """ + if inplace: + return torch.celu_(input, alpha) + return torch.celu(input, alpha) + +celu_ = _add_docstr(torch.celu_, r""" +celu_(input, alpha=1.) -> Tensor + +In-place version of :func:`~celu`. +""") + + def leaky_relu(input, negative_slope=0.01, inplace=False): r""" leaky_relu(input, negative_slope=0.01, inplace=False) -> Tensor @@ -859,7 +878,7 @@ def softmin(input, dim=None, _stacklevel=3): """ if dim is None: dim = _get_softmax_dim('softmin', input.dim(), _stacklevel) - return -input.softmax(dim) + return (-input).softmax(dim) def softmax(input, dim=None, _stacklevel=3): @@ -1099,7 +1118,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, assert padding_idx >= -weight.size(0), 'Padding_idx must be within num_embeddings' padding_idx = weight.size(0) + padding_idx elif padding_idx is None: - padding_idx = -1 + padding_idx = -1 if max_norm is not None: # `embedding_renorm_` will call .contiguous() on input anyways, so we # call it here and take advantage of the improved locality in the @@ -1350,6 +1369,41 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1): # loss +def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, + reduction='elementwise_mean'): + r"""The Connectionist Temporal Classification loss. + + See :class:`~torch.nn.CTCLoss` for details. + + Args: + log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, + `T = input length`, and `N = batch size`. + The logarithmized probabilities of the outputs + (e.g. obtained with :func:`torch.nn.functional.log_softmax`). + targets: :math:`(N, S)` or `(sum(target_lenghts))`. + Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. + input_lengths: :math:`(N)`. + Lengths of the inputs (must each be :math:`\leq T`) + target_lengths: :math:`(N)`. + Lengths of the targets + blank (int, optional): + Blank label. Default :math:`0`. + reduction (string, optional): Specifies the reduction to apply to the output: + 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, + 'elementwise_mean': the output losses will be divided by the target lengths and + then the mean over the batch is taken. 
Default: 'elementwise_mean' + + Example:: + + >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() + >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) + >>> input_lengths = torch.full((16,), 50, dtype=torch.long) + >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) + >>> loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths) + >>> loss.backward() + """ + return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction)) + def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='elementwise_mean'): @@ -1671,7 +1725,7 @@ def _pointwise_loss(lambd, lambd_optimized, input, target, reduction='elementwis return d return torch.mean(d) if reduction == 'elementwise_mean' else torch.sum(d) else: - return lambd_optimized(input, target, reduction) + return lambd_optimized(input, target, _Reduction.get_enum(reduction)) def smooth_l1_loss(input, target, size_average=None, reduce=None, reduction='elementwise_mean'): @@ -1695,9 +1749,7 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='elementwis See :class:`~torch.nn.L1Loss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_enum(size_average, reduce) - else: - reduction = _Reduction.get_enum(reduction) + reduction = _Reduction.legacy_get_string(size_average, reduce) return _pointwise_loss(lambda a, b: torch.abs(a - b), torch._C._nn.l1_loss, input, target, reduction) @@ -1710,9 +1762,7 @@ def mse_loss(input, target, size_average=None, reduce=None, reduction='elementwi See :class:`~torch.nn.MSELoss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_enum(size_average, reduce) - else: - reduction = _Reduction.get_enum(reduction) + reduction = _Reduction.legacy_get_string(size_average, reduce) return _pointwise_loss(lambda a, b: (a - b) ** 2, torch._C._nn.mse_loss, input, target, reduction) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 4d98f482768a63..6c66f8d43f005f 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -3,10 +3,10 @@ from .conv import Conv1d, Conv2d, Conv3d, \ ConvTranspose1d, ConvTranspose2d, ConvTranspose3d from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \ - Softmax, Softmax2d, LogSoftmax, ELU, SELU, Hardshrink, LeakyReLU, LogSigmoid, \ + Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, Hardshrink, LeakyReLU, LogSigmoid, \ Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ - CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \ + CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict @@ -31,10 +31,10 @@ __all__ = [ 'Module', 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'Threshold', 'ReLU', 'Hardtanh', 'ReLU6', - 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'GLU', 'Hardshrink', + 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'CELU', 'GLU', 'Hardshrink', 'LeakyReLU', 'LogSigmoid', 
'Softplus', 'Softshrink', 'PReLU', 'Softsign', 'Softmin', 'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', - 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', + 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index d372a2cae21d2c..51cfab79404145 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -118,6 +118,7 @@ class RReLU(Module): .. _`Empirical Evaluation of Rectified Activations in Convolutional Network`: https://arxiv.org/abs/1505.00853 """ + def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False): super(RReLU, self).__init__() self.lower = lower @@ -299,6 +300,46 @@ def extra_repr(self): return 'alpha={}{}'.format(self.alpha, inplace_str) +class CELU(Module): + r"""Applies element-wise, + :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))` + + More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . + + Args: + alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: scripts/activation_images/CELU.png + + Examples:: + + >>> m = nn.CELU() + >>> input = torch.randn(2) + >>> output = m(input) + + .. _`Continuously Differentiable Exponential Linear Units`: + https://arxiv.org/abs/1704.07483 + """ + + def __init__(self, alpha=1., inplace=False): + super(CELU, self).__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input): + return F.celu(input, self.alpha, self.inplace) + + def extra_repr(self): + inplace_str = ', inplace' if self.inplace else '' + return 'alpha={}{}'.format(self.alpha, inplace_str) + + class SELU(Module): r"""Applies element-wise, :math:`\text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))`, @@ -668,6 +709,7 @@ class Softmin(Module): >>> input = torch.randn(2, 3) >>> output = m(input) """ + def __init__(self, dim=None): super(Softmin, self).__init__() self.dim = dim diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 489e8998843f98..ec7d60d8125152 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1123,6 +1123,61 @@ def forward(self, anchor, positive, negative): return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p, eps=self.eps, swap=self.swap, reduction=self.reduction) + +class CTCLoss(_Loss): + r"""The Connectionist Temporal Classification loss. + + Args: + blank (int, optional): blank label. Default :math:`0`. + reduction (string, optional): Specifies the reduction to apply to the output: + 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, + 'elementwise_mean': the output losses will be divided by the target lengths and + then the mean over the batch is taken. 
Default: 'elementwise_mean' + + Inputs: + log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, + `T = input length`, and `N = batch size`. + The logarithmized probabilities of the outputs + (e.g. obtained with :func:`torch.nn.functional.log_softmax`). + targets: :math:`(N, S)` or `(sum(target_lenghts))`. + Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. + input_lengths: :math:`(N)`. + Lengths of the inputs (must each be :math:`\leq T`) + target_lengths: :math:`(N)`. + Lengths of the targets + + + Example:: + + >>> ctc_loss = nn.CTCLoss() + >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() + >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) + >>> input_lengths = torch.full((16,), 50, dtype=torch.long) + >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) + >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths) + >>> loss.backward() + + Reference: + A. Graves et al.: Connectionist Temporal Classification: + Labelling Unsegmented Sequence Data with Recurrent Neural Networks: + https://www.cs.toronto.edu/~graves/icml_2006.pdf + + .. Note:: + In order to use CuDNN, the following must be satisfied: :attr:`targets` must be + in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`, + :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of + :class:`torch.IntTensor`. + + The regular implementation uses the (more common in PyTorch) `torch.long` dtype. + """ + + def __init__(self, blank=0, reduction='elementwise_mean'): + super(CTCLoss, self).__init__(reduction=reduction) + self.blank = blank + + def forward(self, log_probs, targets, input_lengths, target_lengths): + return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction) + # TODO: L1HingeEmbeddingCriterion # TODO: MSECriterion weight # TODO: ClassSimplexCriterion diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index c2b32cb97b6b01..424670ac76fc14 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -242,11 +242,7 @@ def train(self, mode=True): module.train(mode) def _dist_broadcast_coalesced(self, tensors, buffer_size): - for tensors in _take_tensors(tensors, buffer_size): - flat_tensors = _flatten_dense_tensors(tensors) - c10d.broadcast(flat_tensors, 0, self.process_group).wait() - for tensor, synced in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)): - tensor.copy_(synced) + c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) def _sync_params(self): if len(self.device_ids) > 1: diff --git a/torch/nn/utils/convert_parameters.py b/torch/nn/utils/convert_parameters.py index 7f0dd1666dad9c..36a7eb207bcc65 100644 --- a/torch/nn/utils/convert_parameters.py +++ b/torch/nn/utils/convert_parameters.py @@ -45,9 +45,9 @@ def vector_to_parameters(vec, parameters): param_device = _check_param_device(param, param_device) # The length of the parameter - num_param = torch.prod(torch.LongTensor(list(param.size()))) + num_param = param.numel() # Slice the vector, reshape it, and replace the old data of the parameter - param.data = vec[pointer:pointer + num_param].view(param.size()).data + param.data = vec[pointer:pointer + num_param].view_as(param).data # Increment the pointer pointer += num_param diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3ca44f35c4eff3..3262ca282b2c5d 100644 --- 
a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -70,6 +70,12 @@ def _get_const(value, desc, arg_name): return _parse_arg(value, desc) +def _unpack_list(list_value): + list_node = list_value.node() + assert list_node.kind() == "prim::ListConstruct" + return list_node.inputs() + + def parse_args(*arg_descriptors): def decorator(fn): def wrapper(g, *args): @@ -215,13 +221,18 @@ def reciprocal(g, self): return g.op("Div", _if_scalar_type_as(g, torch.ones(1), self), self) -# This syntax is Python 2 portable -def cat(g, *args): - dim = _get_const(args[-1], 'i', 'dim') - tensors = args[:-1] +@parse_args('v', 'i') +def cat(g, tensor_list, dim): + tensors = _unpack_list(tensor_list) return g.op("Concat", *tensors, axis_i=dim) +@parse_args('v', 'i') +def stack(g, tensor_list, dim): + unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in _unpack_list(tensor_list)] + return g.op("Concat", *unsqueezed, axis_i=dim) + + def mm(g, self, other): # Create a dummy C tensor. Only needed for API purposes, the value is # since beta = 0 @@ -349,11 +360,6 @@ def view(g, self, size): return g.op("Reshape", self, shape) -def stack(g, *args): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in args[:-1]] + [args[-1]] - return concat(g, *unsqueezed) - - @parse_args('v', 'i', 'i') def split(g, self, split_size, dim): size = self.type().sizes()[dim] @@ -555,9 +561,10 @@ def replication_pad(g, input, padding): @parse_args('v', 'is') def upsample_nearest2d(g, input, output_size): + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] return g.op("Upsample", input, - height_scale_f=float(output_size[-2]) / input.type().sizes()[-2], - width_scale_f=float(output_size[-1]) / input.type().sizes()[-1], + scales_f=[1., 1., height_scale, width_scale], mode_s="nearest") @@ -565,10 +572,11 @@ def upsample_nearest2d(g, input, output_size): def upsample_bilinear2d(g, input, output_size, align_corners): if align_corners: return _unimplemented("upsample_bilinear2d", "align_corners == True") - w_scale = float(output_size[-1]) / input.type().sizes()[-1] - h_scale = float(output_size[-2]) / input.type().sizes()[-2] - return g.op("Upsample", input, width_scale_f=w_scale, - height_scale_f=h_scale, mode_s="bilinear") + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] + return g.op("Upsample", input, + scales_f=[1., 1., height_scale, width_scale], + mode_s="bilinear") def gt(g, input, other): @@ -659,10 +667,12 @@ def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) -@parse_args('v', 't', 't') -def elu(g, input, alpha, scale): +@parse_args('v', 't', 't', 't') +def elu(g, input, alpha, scale, input_scale): if scale and scale != 1.: return _unimplemented("scale", "does not support scale in Elu") + if input_scale and input_scale != 1.: + return _unimplemented("input_scale", "does not support input_scale in Elu") # See Note [Export inplace] return g.op("Elu", input, alpha_f=_scalar(alpha)) @@ -676,8 +686,10 @@ def index_select(g, self, dim, index): return g.op("Gather", self, index, axis_i=dim) -def index_put(g, *inputs): - return g.op("ATen", *inputs, operator_s='index_put') +def index_put(g, self, indices_list_value, values): + indices_list = list(_unpack_list(indices_list_value)) + args = [self] + indices_list + [values] + return g.op("ATen", *args, operator_s='index_put') def 
type_as(g, self, other): @@ -868,14 +880,17 @@ def topk(g, self, k, dim, largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) -@parse_args('v', 'is') def repeat(g, self, repeats): - if self.isTensor(): + if not _is_value(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + const_repeats = _maybe_get_const(repeats, 'is') + + if self.isTensor() and not _is_value(const_repeats): sizes = self.type().sizes() - diff_dims = len(repeats) - len(sizes) + diff_dims = len(const_repeats) - len(sizes) if diff_dims > 0: self = view(g, self, [1] * diff_dims + sizes) - return g.op("Tile", self, g.op("Constant", value_t=torch.LongTensor(repeats))) + return g.op("Tile", self, repeats) def instance_norm(g, input, **kwargs): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 4f9299d258ea3e..b770b900c4edd3 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -480,8 +480,14 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor raise RuntimeError("Unsupported prim::Constant kind: `{}`. Send a bug report.".format( n.kindOf("value"))) elif op_name == "ListConstruct": - unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] - return g.op("Concat", *unsqueezed, axis_i=0) + t = n.output().type() + # Tensor lists are used mostly for inputs to cat/stack. They need to be handled + # in those symbolics, and should become dead afterwards. + if t == torch._C.ListType.ofTensors(): + return None + elif t == torch._C.ListType.ofInts(): + unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] + return g.op("Concat", *unsqueezed, axis_i=0) elif op_name == "Undefined": # Undefined is not an ONNX operator; keep it as prim::Undefined # and let the exporter handle finally eliminating these diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index ad7f780719ccd3..96cfaff8684cf0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -1,3 +1,4 @@ +import types import math import torch from torch._six import inf @@ -86,6 +87,37 @@ def __init__(self, optimizer, lr_lambda, last_epoch=-1): self.last_epoch = last_epoch super(LambdaLR, self).__init__(optimizer, last_epoch) + def state_dict(self): + """Returns the state of the scheduler as a :class:`dict`. + + It contains an entry for every variable in self.__dict__ which + is not the optimizer. + The learning rate lambda functions will only be saved if they are callable objects + and not if they are functions or lambdas. + """ + state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', 'lr_lambdas')} + state_dict['lr_lambdas'] = [None] * len(self.lr_lambdas) + + for idx, fn in enumerate(self.lr_lambdas): + if not isinstance(fn, types.FunctionType): + state_dict['lr_lambdas'][idx] = fn.__dict__.copy() + + return state_dict + + def load_state_dict(self, state_dict): + """Loads the schedulers state. + + Arguments: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`. 
+ """ + lr_lambdas = state_dict.pop('lr_lambdas') + self.__dict__.update(state_dict) + + for idx, fn in enumerate(lr_lambdas): + if fn is not None: + self.lr_lambdas[idx].__dict__.update(fn) + def get_lr(self): return [base_lr * lmbda(self.last_epoch) for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] diff --git a/torch/tensor.py b/torch/tensor.py index 6b587fcf903586..9784fd59c9d2fb 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -384,6 +384,8 @@ def __dir__(self): return sorted(keys) # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` + __array_priority__ = 1000 # prefer Tensor ops over numpy ones + def __array__(self, dtype=None): if dtype is None: return self.cpu().numpy()