diff --git a/.clang-tidy b/.clang-tidy index d5fc66c26d42d9..5466a4a31d20a3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,7 +2,6 @@ # NOTE: there must be no spaces before the '-', so put the comma first. Checks: ' * - ,clang-analyzer-* ,modernize-* ,-cert-err58-cpp ,-cert-err60-cpp @@ -10,7 +9,6 @@ Checks: ' ,-cppcoreguidelines-owning-memory ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay ,-cppcoreguidelines-pro-bounds-constant-array-index - ,-cppcoreguidelines-pro-type-member-init ,-cppcoreguidelines-pro-type-static-cast-downcast ,-cppcoreguidelines-pro-type-vararg ,-cppcoreguidelines-special-member-functions @@ -25,11 +23,9 @@ Checks: ' ,-hicpp-braces-around-statements ,-hicpp-explicit-conversions ,-hicpp-no-array-decay - ,-hicpp-signed-bitwise ,-hicpp-special-member-functions ,-hicpp-vararg ,-llvm-header-guard - ,-llvm-include-order ,-llvm-namespace-comment ,-misc-unused-parameters ,-modernize-make-unique @@ -38,6 +34,7 @@ Checks: ' ,-readability-braces-around-statements ,-readability-else-after-return ,-readability-named-parameter + ,clang-analyzer-* ' WarningsAsErrors: '' HeaderFilterRegex: 'torch/csrc/' diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index cd41d1a02f8290..00000000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.bat text eol=crlf diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 3bc5157d9cab7a..6b8aa6fc62bb94 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -124,7 +124,7 @@ CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") CMAKE_ARGS+=("-DUSE_ZSTD=ON") CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") -if [[ $BUILD_ENVIRONMENT == *-aten-* || -n "$INTEGRATED" ]]; then +if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then CMAKE_ARGS+=("-DBUILD_ATEN=ON") fi diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 40e3e21417b9b2..053a9be5e05487 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -64,13 +64,7 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do ;; */aten/*) # ATen uses test framework Catch2 - # NB: We do NOT use the xml test reporter, because - # Catch doesn't support multiple reporters - # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 - # which means that enabling XML output means you lose useful stdout - # output for Jenkins. It's more important to have useful console - # output than it is to have XML output for Jenkins. 
- "$test" + "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" ;; *) "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" @@ -115,10 +109,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # Our AMD CI boxes have 4 gpus on each - # Remove this once we have added multi-gpu support - export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 48e81dfd635bce..56db6914c1c20a 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -43,9 +43,12 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking export KMTHINLTO=1 - python tools/amd_build/build_pytorch_amd.py - USE_ROCM=1 python setup.py install --user - exit 0 + sudo chown -R jenkins:jenkins /usr/local + rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true + python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" + + USE_ROCM=1 python setup.py install + exit fi # TODO: Don't install this here diff --git a/CMakeLists.txt b/CMakeLists.txt index c7eb20d1336550..651e230ab35ea7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,10 +214,9 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow") # These flags are not available in GCC-4.8.5. Set only when using clang. # Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html - if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-typedef-redefinition") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") @@ -227,7 +226,6 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++14-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constexpr-not-const") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") endif() if ((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0"))) OR (CMAKE_COMPILER_IS_GNUCXX @@ -286,8 +284,6 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # in PROJECT_SOURCE_DIR. 
include_directories(BEFORE ${PROJECT_BINARY_DIR}) -include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) - # ---[ Old caffe protobuf if(BUILD_CAFFE2) add_subdirectory(caffe/proto) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 2f2ffdce186d39..462a12b086d2d0 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -146,5 +146,4 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) endif() diff --git a/aten/src/ATen/core/AlignOf.h b/aten/src/ATen/AlignOf.h similarity index 68% rename from aten/src/ATen/core/AlignOf.h rename to aten/src/ATen/AlignOf.h index a7e42196f43ecd..5e9f0127b32e70 100644 --- a/aten/src/ATen/core/AlignOf.h +++ b/aten/src/ATen/AlignOf.h @@ -33,7 +33,7 @@ namespace at { // MSVC requires special handling here. #ifndef _MSC_VER -template +template struct AlignedCharArray { alignas(Alignment) char buffer[Size]; }; @@ -41,7 +41,7 @@ struct AlignedCharArray { #else // _MSC_VER /// \brief Create a type with an aligned char buffer. -template +template struct AlignedCharArray; // We provide special variations of this template for the most common @@ -52,7 +52,7 @@ struct AlignedCharArray; // MSVC warns on the existence of the declspec despite the union member forcing // proper alignment. -template +template struct AlignedCharArray<1, Size> { union { char aligned; @@ -60,7 +60,7 @@ struct AlignedCharArray<1, Size> { }; }; -template +template struct AlignedCharArray<2, Size> { union { short aligned; @@ -68,7 +68,7 @@ struct AlignedCharArray<2, Size> { }; }; -template +template struct AlignedCharArray<4, Size> { union { int aligned; @@ -76,7 +76,7 @@ struct AlignedCharArray<4, Size> { }; }; -template +template struct AlignedCharArray<8, Size> { union { double aligned; @@ -84,13 +84,14 @@ struct AlignedCharArray<8, Size> { }; }; + // The rest of these are provided with a __declspec(align(...)) and we simply // can't pass them by-value as function arguments on MSVC. 
#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ - template \ - struct AlignedCharArray { \ - __declspec(align(x)) char buffer[Size]; \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ }; AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) @@ -103,47 +104,24 @@ AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) #endif // _MSC_VER namespace detail { -template < - typename T1, - typename T2 = char, - typename T3 = char, - typename T4 = char, - typename T5 = char, - typename T6 = char, - typename T7 = char, - typename T8 = char, - typename T9 = char, - typename T10 = char> +template class AlignerImpl { - T1 t1; - T2 t2; - T3 t3; - T4 t4; - T5 t5; - T6 t6; - T7 t7; - T8 t8; - T9 t9; - T10 t10; + T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; AlignerImpl() = delete; }; -template < - typename T1, - typename T2 = char, - typename T3 = char, - typename T4 = char, - typename T5 = char, - typename T6 = char, - typename T7 = char, - typename T8 = char, - typename T9 = char, - typename T10 = char> +template union SizerImpl { char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], - arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], - arr9[sizeof(T9)], arr10[sizeof(T10)]; + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; }; } // end namespace detail @@ -154,20 +132,14 @@ union SizerImpl { /// expose a char array buffer member which can be used as suitable storage for /// a placement new of any of these types. Support for more than ten types can /// be added at the cost of more boilerplate. -template < - typename T1, - typename T2 = char, - typename T3 = char, - typename T4 = char, - typename T5 = char, - typename T6 = char, - typename T7 = char, - typename T8 = char, - typename T9 = char, - typename T10 = char> -struct AlignedCharArrayUnion - : AlignedCharArray< - alignof(detail::AlignerImpl), - sizeof(::at::detail:: - SizerImpl)> {}; +template +struct AlignedCharArrayUnion : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::at::detail::SizerImpl)> { +}; } // end namespace at diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index 26989a7ea7fbed..c1c78102a0fef8 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace at { diff --git a/aten/src/ATen/ArrayRef.cpp b/aten/src/ATen/ArrayRef.cpp deleted file mode 100644 index 2a5d1f7a7cb595..00000000000000 --- a/aten/src/ATen/ArrayRef.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/aten/src/ATen/ArrayRef.h b/aten/src/ATen/ArrayRef.h index f52a5fcf1c2c58..df144025578c6b 100644 --- a/aten/src/ATen/ArrayRef.h +++ b/aten/src/ATen/ArrayRef.h @@ -1,2 +1,192 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. +// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... 
+ #pragma once -#include + +#include +#include + +#include +#include +#include + +namespace at { + /// ArrayRef - Represent a constant reference to an array (0 or more elements + /// consecutively in memory), i.e. a start pointer and a length. It allows + /// various APIs to take consecutive elements easily and conveniently. + /// + /// This class does not own the underlying data, it is expected to be used in + /// situations where the data resides in some other buffer, whose lifetime + /// extends past that of the ArrayRef. For this reason, it is not in general + /// safe to store an ArrayRef. + /// + /// This is intended to be trivially copyable, so it should be passed by + /// value. + template + class ArrayRef { + public: + typedef const T *iterator; + typedef const T *const_iterator; + typedef size_t size_type; + + typedef std::reverse_iterator reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T *Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + /*implicit*/ ArrayRef(const T &OneElt) + : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + /*implicit*/ ArrayRef(const T *data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + ArrayRef(const T *begin, const T *end) + : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) + : Data(Vec.data()), Length(Vec.size()) { + } + + /// Construct an ArrayRef from a std::vector. + template + /*implicit*/ ArrayRef(const std::vector &Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /*implicit*/ constexpr ArrayRef(const std::array &Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /*implicit*/ ArrayRef(const std::initializer_list &Vec) + : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + const_iterator begin() const { return Data; } + const_iterator end() const { return Data + Length; } + + reverse_iterator rbegin() const { return reverse_iterator(end()); } + reverse_iterator rend() const { return reverse_iterator(begin()); } + + /// empty - Check if the array is empty. + bool empty() const { return Length == 0; } + + const T *data() const { return Data; } + + /// size - Get the array size. + size_t size() const { return Length; } + + /// front - Get the first element. + const T &front() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. + const T &back() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length-1]; + } + + /// equals - Check for element-wise equality. 
+ bool equals(ArrayRef RHS) const { + if (Length != RHS.Length) + return false; + return std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + ArrayRef slice(size_t N, size_t M) const { + AT_CHECK(N+M <= size(), "ArrayRef: invalid slice, ", N, " + ", M, " is not <= ", size()); + return ArrayRef(data()+N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + ArrayRef slice(size_t N) const { return slice(N, size() - N); } + + /// @} + /// @name Operator Overloads + /// @{ + const T &operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + const T &at(size_t Index) const { + AT_CHECK(Index < Length, "ArrayRef: invalid index ", Index, " for length ", Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data+Length); + } + + /// @} + /// @name Conversion operators + /// @{ + operator std::vector() const { + return std::vector(Data, Data+Length); + } + + /// @} + }; + +} // end namespace at diff --git a/aten/src/ATen/core/Backtrace.cpp b/aten/src/ATen/Backtrace.cpp similarity index 92% rename from aten/src/ATen/core/Backtrace.cpp rename to aten/src/ATen/Backtrace.cpp index 7914489d50ece3..a8e062051ee633 100644 --- a/aten/src/ATen/core/Backtrace.cpp +++ b/aten/src/ATen/Backtrace.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -7,30 +7,18 @@ #include #include -#if defined(__ANDROID__) -#define AT_CORE_MOBILE 1 -#elif ( \ - defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) -#define AT_CORE_MOBILE 1 -#else -#define AT_CORE_MOBILE 0 -#endif - -#if !AT_CORE_MOBILE && !defined(_WIN32) && !defined(__EMSCRIPTEN__) -#define SUPPORTS_BACKTRACE 1 -#else -#define SUPPORTS_BACKTRACE 0 -#endif - -#if SUPPORTS_BACKTRACE +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) #include #include #endif // !defined(_WIN32) namespace at { - -#if SUPPORTS_BACKTRACE +#if defined(_MSC_VER) +// Windows does not have cxxabi.h, so we will simply return the original. +std::string demangle(const char* name) { + return std::string(name); +} +#elif !defined(__EMSCRIPTEN__) std::string demangle(const char* name) { int status = -1; @@ -57,10 +45,6 @@ std::string demangle(const char* name) { return name; } } -#else -std::string demangle(const char* name) { - return std::string(name); -} #endif // TODO: This backtrace retrieval can be implemented on Windows via the Windows @@ -68,7 +52,8 @@ std::string demangle(const char* name) { // https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code // https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows // https://msdn.microsoft.com/en-us/library/windows/desktop/bb204633%28v=vs.85%29.aspx. 
-#if SUPPORTS_BACKTRACE +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) + namespace { struct FrameInformation { @@ -158,13 +143,14 @@ at::optional parse_frame_information( } } // anonymous namespace -#endif // SUPPORTS_BACKTRACE + +#endif // !defined(_WIN32) std::string get_backtrace( size_t frames_to_skip, size_t maximum_number_of_frames, bool skip_python_frames) { -#if SUPPORTS_BACKTRACE +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) // We always skip this frame (backtrace). frames_to_skip += 1; @@ -235,9 +221,10 @@ std::string get_backtrace( } return stream.str(); -#else // !SUPPORTS_BACKTRACE + +#else + return "(no backtrace available)"; -#endif // SUPPORTS_BACKTRACE +#endif } - } // namespace at diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h index bdef9f4a9de439..347c430d61b75c 100644 --- a/aten/src/ATen/Backtrace.h +++ b/aten/src/ATen/Backtrace.h @@ -1,2 +1,28 @@ #pragma once -#include + +#include +#include +#include + +#include + +namespace at { +/// Utility to demangle a C++ symbol name. +AT_API std::string demangle(const char* name); + +/// Returns the printable name of the type. +template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +AT_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 25a2e6d8b501f0..562910ad86a298 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -44,7 +44,6 @@ CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig # NB: If you edit these globs, you'll have to update setup.py package_data as well FILE(GLOB base_h "*.h" "detail/*.h") FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") -add_subdirectory(core) FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") @@ -63,7 +62,7 @@ FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") -set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -394,7 +393,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake -FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h}) +FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) @@ -445,7 +444,6 @@ if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() # Pass source, includes, and libs to parent -set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS 
${ATen_CUDA_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index ef370ea6e0bc30..2db2786b1c66cd 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -109,8 +109,8 @@ struct strided_tensor_iter { : data_(tensor.data()), dim_(tensor.ndimension()), counter_(dim_, 0), - sizes_(tensor.sizes().vec()), - strides_(tensor.strides().vec()) { + sizes_(tensor.sizes()), + strides_(tensor.strides()) { _setup_arrays(tensor, this); } }; diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index d153e6bc6ada00..59f6ff755ee3f1 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -37,11 +37,8 @@ Context::Context() Type::registerCPU(this); } -// NB: Ensure that globalContext is initialized before we load -// variable hooks, otherwise we will deadlock. Regardless, the -// deadlock is bad, and being tracked at https://github.com/pytorch/pytorch/issues/9784 -static Context globalContext_; Context & globalContext() { + static Context globalContext_; return globalContext_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 7d3fdd1cc2d4af..309c4be2e651dd 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,9 +9,6 @@ #include "ATen/detail/CUDAHooksInterface.h" #include "ATen/CUDAStream.h" -// This is temporary -#include "ATen/core/ATenCoreTest.h" - #include #include #include diff --git a/aten/src/ATen/core/Error.cpp b/aten/src/ATen/Error.cpp similarity index 64% rename from aten/src/ATen/core/Error.cpp rename to aten/src/ATen/Error.cpp index 35ba7d644e109b..1261fbe0295d6c 100644 --- a/aten/src/ATen/core/Error.cpp +++ b/aten/src/ATen/Error.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -11,13 +11,9 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { } Error::Error(SourceLocation source_location, std::string err) - : what_without_backtrace_(err), - what_( - str(err, - " (", - source_location, - ")\n", - get_backtrace(/*frames_to_skip=*/2))) {} + : what_without_backtrace_(err) + , what_(str(err, " (", source_location, ")\n", get_backtrace(/*frames_to_skip=*/2))) + {} void Warning::warn(SourceLocation source_location, std::string msg) { warning_handler_(source_location, msg.c_str()); @@ -27,9 +23,7 @@ void Warning::set_warning_handler(handler_t handler) { warning_handler_ = handler; } -void Warning::print_warning( - const SourceLocation& source_location, - const char* msg) { +void Warning::print_warning(const SourceLocation& source_location, const char* msg) { std::cerr << "Warning: " << msg << " (" << source_location << ")\n"; } diff --git a/aten/src/ATen/Error.h b/aten/src/ATen/Error.h index 2a184d4ecbd5ea..5a41eb7c74e7cb 100644 --- a/aten/src/ATen/Error.h +++ b/aten/src/ATen/Error.h @@ -1,2 +1,131 @@ #pragma once -#include + +#include // for AT_API +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace at { + +namespace detail { + +inline std::ostream& _str(std::ostream& ss) { return ss; } + +template +inline std::ostream& _str(std::ostream& ss, const T& t) { + ss << t; + return ss; +} + +template +inline std::ostream& +_str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +} // namespace detail + +// Convert a list of string-like arguments into a single string. 
+template +inline std::string str(const Args&... args) { + std::ostringstream ss; + detail::_str(ss, args...); + return ss.str(); +} + +// Specializations for already-a-string types. +template <> +inline std::string str(const std::string& str) { + return str; +} +inline std::string str(const char* c_str) { + return c_str; +} + +/// Represents a location in source code (for debugging). +struct SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. Should +/// primarily be used with the `AT_ERROR` macro. +/// +/// NB: at::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class AT_API Error : public std::exception { + std::string what_without_backtrace_; + std::string what_; + +public: + Error(SourceLocation source_location, std::string err); + + /// Returns the complete error message, including the source location. + const char* what() const noexcept override { + return what_.c_str(); + } + + /// Returns only the error message string, without source location. + const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } +}; + +class AT_API Warning { + using handler_t = void(*)(const SourceLocation& source_location, const char* msg); + +public: + /// Issue a warning with a given message. Dispatched to the current + /// warning handler. + static void warn(SourceLocation source_location, std::string msg); + + /// Sets the global warning handler. This is not thread-safe, so it should + /// generally be called once during initialization. + static void set_warning_handler(handler_t handler); + + /// The default warning handler. Prints the message to stderr. + static void print_warning(const SourceLocation& source_location, const char* msg); + +private: + static handler_t warning_handler_; +}; + + +} // namespace at + +// TODO: variants that print the expression tested and thus don't require strings +// TODO: CAFFE_ENFORCE_WITH_CALLER style macro + +#define AT_ERROR(...) \ + throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_WARN(...) \ + at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_ASSERT(cond) \ + if (!(cond)) { \ + AT_ERROR(#cond " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch."); \ + } + +#define AT_ASSERTM(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str(#cond, " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch. ", __VA_ARGS__)); \ + } + +#define AT_CHECK(cond, ...) 
\ + if (!(cond)) { \ + AT_ERROR(at::str(__VA_ARGS__)); \ + } diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 934be4093b7257..35125cfa6751bb 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -111,7 +111,7 @@ inline std::vector expand_outplace(TensorList to_expand) { if (!to_expand[i].defined()) { continue; } else if (first) { - sizes = to_expand[i].sizes().vec(); + sizes = to_expand[i].sizes(); first = false; } else { sizes = infer_size(sizes, to_expand[i].sizes()); diff --git a/aten/src/ATen/Half-inl.h b/aten/src/ATen/Half-inl.h new file mode 100644 index 00000000000000..e5563faca3ab33 --- /dev/null +++ b/aten/src/ATen/Half-inl.h @@ -0,0 +1,168 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +namespace at { + +/// Constructors + +inline AT_HOSTDEVICE Half::Half(float value) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x = __half_as_short(__float2half(value)); +#else + x = detail::float2halfbits(value); +#endif +} + +/// Implicit conversions + +inline AT_HOSTDEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#else + return detail::halfbits2float(x); +#endif +} + +#ifdef __CUDACC__ +inline AT_HOSTDEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline AT_HOSTDEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +/// Arithmetic + +inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { + return (float)a + (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { + return (float)a - (float)b; +} + +inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { + return (float)a * (float)b; +} + +inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { + return (float)a / (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a) { + return -(float)a; +} + +inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline AT_HOSTDEVICE float operator+(Half a, float b) { return (float)a + b; } +inline AT_HOSTDEVICE float operator-(Half a, float b) { return (float)a - b; } +inline AT_HOSTDEVICE float operator*(Half a, float b) { return (float)a * b; } +inline AT_HOSTDEVICE float operator/(Half a, float b) { return (float)a / b; } + +inline AT_HOSTDEVICE float operator+(float a, Half b) { return a + (float)b; } +inline AT_HOSTDEVICE float operator-(float a, Half b) { return a - (float)b; } +inline AT_HOSTDEVICE float operator*(float a, Half b) { return a * (float)b; } +inline AT_HOSTDEVICE float operator/(float a, Half b) { return a / (float)b; } + +inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { return a += (float)b; } +inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { return a -= (float)b; } +inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { return a *= (float)b; } +inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { return a /= (float)b; } + +/// Arithmetic with doubles + +inline AT_HOSTDEVICE double operator+(Half a, double b) { 
return (double)a + b; } +inline AT_HOSTDEVICE double operator-(Half a, double b) { return (double)a - b; } +inline AT_HOSTDEVICE double operator*(Half a, double b) { return (double)a * b; } +inline AT_HOSTDEVICE double operator/(Half a, double b) { return (double)a / b; } + +inline AT_HOSTDEVICE double operator+(double a, Half b) { return a + (double)b; } +inline AT_HOSTDEVICE double operator-(double a, Half b) { return a - (double)b; } +inline AT_HOSTDEVICE double operator*(double a, Half b) { return a * (double)b; } +inline AT_HOSTDEVICE double operator/(double a, Half b) { return a / (double)b; } + +/// Arithmetic with ints + +inline AT_HOSTDEVICE Half operator+(Half a, int b) { return a + (Half)b; } +inline AT_HOSTDEVICE Half operator-(Half a, int b) { return a - (Half)b; } +inline AT_HOSTDEVICE Half operator*(Half a, int b) { return a * (Half)b; } +inline AT_HOSTDEVICE Half operator/(Half a, int b) { return a / (Half)b; } + +inline AT_HOSTDEVICE Half operator+(int a, Half b) { return (Half)a + b; } +inline AT_HOSTDEVICE Half operator-(int a, Half b) { return (Half)a - b; } +inline AT_HOSTDEVICE Half operator*(int a, Half b) { return (Half)a * b; } +inline AT_HOSTDEVICE Half operator/(int a, Half b) { return (Half)a / b; } + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from at::Half to float. + +} // namespace at + +namespace std { + +template<> class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = numeric_limits::tinyness_before; + static constexpr at::Half min() { return at::Half(0x0400, at::Half::from_bits); } + static constexpr at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits); } + static constexpr at::Half max() { return at::Half(0x7BFF, at::Half::from_bits); } + static constexpr at::Half epsilon() { return at::Half(0x1400, at::Half::from_bits); } + static constexpr at::Half round_error() { return at::Half(0x3800, at::Half::from_bits); } + static constexpr at::Half infinity() { return at::Half(0x7C00, at::Half::from_bits); } + static constexpr at::Half quiet_NaN() { return at::Half(0x7E00, at::Half::from_bits); } + static constexpr at::Half signaling_NaN() { return at::Half(0x7D00, at::Half::from_bits); } + static constexpr at::Half denorm_min() { return at::Half(0x0001, at::Half::from_bits); } +}; + +} // namespace std diff --git a/aten/src/ATen/Half.cpp b/aten/src/ATen/Half.cpp new file mode 100644 index 00000000000000..68f80a56ea8195 --- /dev/null +++ b/aten/src/ATen/Half.cpp @@ -0,0 
+1,34 @@ +#include "ATen/Half.h" + +#include "ATen/Tensor.h" +#include "ATen/Context.h" + +#include +#include + +namespace at { + +static_assert(std::is_standard_layout::value, "at::Half must be standard layout."); + +namespace detail { + +float halfbits2float(unsigned short bits) { + float value; + TH_halfbits2float(&bits, &value); + return value; +} + +unsigned short float2halfbits(float value) { + unsigned short bits; + TH_float2halfbits(&value, &bits); + return bits; +} + +} // namespace detail + +std::ostream& operator<<(std::ostream & out, const Half& value) { + out << (float)value; + return out; +} + +} // namespace at diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h index 21941116f19e82..b7ac47e4fda79a 100644 --- a/aten/src/ATen/Half.h +++ b/aten/src/ATen/Half.h @@ -1,2 +1,120 @@ #pragma once -#include + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinisics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include "ATen/ATenGeneral.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifndef AT_HOSTDEVICE + #ifdef __CUDACC__ + #define AT_HOSTDEVICE __host__ __device__ + #else + #define AT_HOSTDEVICE + #endif +#endif + +namespace at { + +namespace detail { + +AT_API float halfbits2float(unsigned short bits); +AT_API unsigned short float2halfbits(float value); + +} + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits = from_bits_t(); + + // HIP wants __host__ __device__ tag, CUDA does not +#ifdef __HIP_PLATFORM_HCC__ + AT_HOSTDEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}; + inline AT_HOSTDEVICE Half(float value); + inline AT_HOSTDEVICE operator float() const; + +#ifdef __CUDACC__ + inline AT_HOSTDEVICE Half(const __half& value); + inline AT_HOSTDEVICE operator __half() const; +#endif +}; + +template To convert(From f) { + return static_cast(f); +} + +// skip isnan and isinf check for integral types +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + if (!limit::is_signed && std::numeric_limits::is_signed) { + // allow for negative numbers to wrap using two's complement arithmetic. + // For example, with uint8, this allows for `a - b` to be treated as + // `a + 255 * b`. 
+ return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); + } else { + return f < limit::lowest() || f > limit::max(); + } +} + +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + if (limit::has_infinity && std::isinf((double)f)) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +template To checked_convert(From f, const char* name) { + if (overflows(f)) { + std::string msg = "value cannot be converted to type "; + msg += name; + msg += " without overflow: "; + msg += std::to_string(f); + throw std::domain_error(std::move(msg)); + } + return convert(f); +} + +template +To HalfFix(From h) { + To ret; + ret.x = h.x; + return ret; +} + +AT_API std::ostream& operator<<(std::ostream & out, const Half& value); + +} // namespace at + +#include "Half-inl.h" + +#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 6aadd62eb1d3fd..794d8e5f8c31a9 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -37,9 +37,7 @@ inline void parallel_for( f(begin_tid, std::min(end, chunk_size + begin_tid)); } #else - if (begin < end) { - f(begin, end); - } + f(begin, end); #endif } diff --git a/aten/src/ATen/Scalar.h b/aten/src/ATen/Scalar.h index f0b84d67554c02..e80d467b138ac3 100644 --- a/aten/src/ATen/Scalar.h +++ b/aten/src/ATen/Scalar.h @@ -10,6 +10,7 @@ #include "ATen/Half.h" #include "ATen/ScalarType.h" #include "ATen/TensorBase.h" +#include "ATen/Utils.h" namespace at { diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index 3651aef60e3e1e..f7c9243a89df2a 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -10,16 +10,16 @@ namespace at { // NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and the serialization format. +// _promoteTypesLookup and probably other places. #define AT_FORALL_SCALAR_TYPES(_) \ -_(uint8_t,Byte,i) /* 0 */ \ -_(int8_t,Char,i) /* 1 */ \ -_(int16_t,Short,i) /* 2 */ \ -_(int,Int,i) /* 3 */ \ -_(int64_t,Long,i) /* 4 */ \ -_(at::Half,Half,d) /* 5 */ \ -_(float,Float,d) /* 6 */ \ -_(double,Double,d) /* 7 */ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(at::Half,Half,d) \ +_(float,Float,d) \ +_(double,Double,d) #define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ _(uint8_t,Byte,i) \ @@ -35,7 +35,7 @@ enum class ScalarType { n, AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) #undef DEFINE_ENUM - Undefined, // 8 + Undefined, NumOptions }; diff --git a/aten/src/ATen/core/SmallVector.cpp b/aten/src/ATen/SmallVector.cpp similarity index 87% rename from aten/src/ATen/core/SmallVector.cpp rename to aten/src/ATen/SmallVector.cpp index 976809c5b50931..59095a2809c7a8 100644 --- a/aten/src/ATen/core/SmallVector.cpp +++ b/aten/src/ATen/SmallVector.cpp @@ -14,22 +14,20 @@ // ATen: modified from llvm::SmallVector. // replaced report_bad_alloc_error with std::bad_alloc -#include +#include "SmallVector.h" namespace at { /// grow_pod - This is an implementation of the grow() method which only works /// on POD-like datatypes and is out of line to reduce code duplication. -void SmallVectorBase::grow_pod( - void* FirstEl, - size_t MinSizeInBytes, - size_t TSize) { +void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, + size_t TSize) { size_t CurSizeBytes = size_in_bytes(); size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. 
if (NewCapacityInBytes < MinSizeInBytes) NewCapacityInBytes = MinSizeInBytes; - void* NewElts; + void *NewElts; if (BeginX == FirstEl) { NewElts = malloc(NewCapacityInBytes); if (NewElts == nullptr) @@ -44,9 +42,9 @@ void SmallVectorBase::grow_pod( throw std::bad_alloc(); } - this->EndX = (char*)NewElts + CurSizeBytes; + this->EndX = (char*)NewElts+CurSizeBytes; this->BeginX = NewElts; this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; } -} // namespace at +} diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 1dbaa933c555dd..7c52ef686aa41a 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -1,2 +1,982 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc +// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE +// replaced iterator_range constructor with inline Container&& constructor +// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers +// removed LLVM_UNLIKELY + #pragma once -#include + +#include "AlignOf.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __GNUG__ && __GNUC__ < 5 +#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) +#else +#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value +#endif + +namespace at { + +namespace detail { + +// From llvm/Support/MathExtras.h +static inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +} + +/// This is all the non-templated stuff common to all SmallVectors. +class AT_API SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize); + +public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon : public SmallVectorBase { +private: + template friend struct SmallVectorStorage; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. 
+ using U = AlignedCharArrayUnion; + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + +protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + +public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T *; + using const_iterator = const T *; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T &; + using const_reference = const T &; + using pointer = T *; + using const_pointer = const T *; + + // forward iterator creation methods. + iterator begin() { return (iterator)this->BeginX; } + const_iterator begin() const { return (const_iterator)this->BeginX; } + iterator end() { return (iterator)this->EndX; } + const_iterator end() const { return (const_iterator)this->EndX; } + +protected: + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + +public: + // reverse iterator creation methods. + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + size_type size() const { return end()-begin(); } + size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { return const_pointer(begin()); } + + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with non-POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. 
+ template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast(malloc(NewCapacity*sizeof(T))); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. 
+ if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. +template +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + +public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase(N*sizeof(T)) { + } + +public: + SmallVectorImpl(const SmallVectorImpl &) = delete; + + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + auto I = this->end(); + for (auto E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) 
variants. + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + assert(I >= this->begin() && "Iterator to erase is out of bounds."); + assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(S >= this->begin() && "Range to erase is out of bounds."); + assert(S <= E && "Trying to erase invalid range."); + assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. 
+ append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template void emplace_back(ArgTypes &&... 
Args) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void *)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template +SmallVectorImpl &SmallVectorImpl:: + operator=(const SmallVectorImpl &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. 
+ this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template +SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements which aren't contained in +/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' +/// element is in the base class. This is specialized for the N=1 and N=0 cases +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; +template struct SmallVectorStorage {}; +template struct SmallVectorStorage {}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// Note that this does not attempt to be exception safe. +/// +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. 
+ SmallVectorStorage Storage; + +public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + template + explicit SmallVector(Container &&c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + template + const SmallVector &operator=(const Container &RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + template + const SmallVector &operator=(Container &&C) { + this->assign(C.begin(), C.end()); + return *this; + } + + const SmallVector &operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector &X) { + return X.capacity_in_bytes(); +} + +} // end namespace at + +namespace std { + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVectorImpl &LHS, at::SmallVectorImpl &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVector &LHS, at::SmallVector &RHS) { + LHS.swap(RHS); + } + +} // end namespace std diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 03a5a6008e7d24..968fd8ebbec266 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -18,14 +18,14 @@ namespace at { // tensor and a [0] size values tensor for such an empty tensor. However, // we don't currently support zero-size dimensions, so we can't actually // do this; so we just allocate zero-size tensors for everything. 
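// Illustrative sketch of the COO bookkeeping the comment above describes
// (hypothetical standalone struct, not the ATen API): an "empty" sparse
// tensor keeps one sparse dim, zero dense dims, nnz == 0, and zero-size
// index/value buffers standing in for the ideal [1, 0] indices / [0] values.
#include <cstdint>
#include <vector>

struct SparseCooShape {
  int64_t sparseDims = 1;        // dimensions indexed sparsely
  int64_t denseDims = 0;         // trailing dense dimensions
  int64_t nnz = 0;               // number of stored entries
  std::vector<int64_t> indices;  // ideally sparseDims * nnz longs
  std::vector<float> values;     // ideally nnz * (dense numel) values
};

// An empty sparse tensor: every buffer is zero-sized.
static const SparseCooShape kEmptySparse{};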
-SparseTensorImpl::SparseTensorImpl(at::Backend backend, at::ScalarType scalar_type) - : TensorImpl(backend, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(Type * type) + : TensorImpl(type, nullptr) , size_{0} , sparseDims_(1) , denseDims_(0) - , indices_(globalContext().getTypeOpt(toDense(backend), ScalarType::Long)->tensor()) - , values_(globalContext().getTypeOpt(toDense(backend), scalar_type)->tensor()) { - AT_ASSERT(backend == Backend::SparseCPU || backend == Backend::SparseCUDA); + , indices_(type->toDense().toScalarType(ScalarType::Long).tensor()) + , values_(type->toDense().tensor()) { + AT_ASSERT(type->is_sparse()); } IntList SparseTensorImpl::sizes() const { diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 307c0f9e5574d1..9ef08705bb0f45 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -48,7 +48,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(at::Backend, at::ScalarType); + explicit SparseTensorImpl(Type * type); int64_t nnz() const { return nnz_; } int64_t sparseDims() const { return sparseDims_; } @@ -75,7 +75,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { if (size.size() == 0) { size_ = {0}; } else { - size_ = size.vec(); + size_ = size; } sparseDims_ = sparseDims; denseDims_ = denseDims; diff --git a/aten/src/ATen/Storage.cpp b/aten/src/ATen/Storage.cpp index 991cfba92efd2a..f5ba512cc27105 100644 --- a/aten/src/ATen/Storage.cpp +++ b/aten/src/ATen/Storage.cpp @@ -1,32 +1,23 @@ #include +#include #include namespace at { -Storage::Storage(at::ScalarType scalar_type, size_t size, Allocator* allocator) - : storage_impl_(new StorageImpl( - scalar_type, - size, - allocator, - /* resizable */ false)) {} - -Storage::Storage( - at::ScalarType scalar_type, - at::DataPtr data_ptr, - size_t size, - const std::function& deleter) - : storage_impl_(new StorageImpl( - scalar_type, - size, - std::move(data_ptr), - /* allocator */ nullptr, - /* resizable */ false)) {} - Storage::~Storage() { if (!storage_impl_) { return; } - storage_impl_->release(); + if (--storage_impl_->refcount == 0) { + if (storage_impl_->finalizer) { + (*storage_impl_->finalizer)(); + } + storage_impl_->finalizer = nullptr; + storage_impl_->data_ptr.clear(); + if (storage_impl_ && --storage_impl_->weakcount == 0) { + delete storage_impl_; + } + } } } // namespace at diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index aa27296c74d40f..a5c85192e36f8c 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -8,12 +8,6 @@ struct AT_API Storage { public: Storage() = delete; Storage(StorageImpl* storage_impl) : storage_impl_(storage_impl) {} - Storage(at::ScalarType, size_t size, Allocator* allocator); - Storage( - at::ScalarType, - at::DataPtr, - size_t size, - const std::function& deleter); ~Storage(); // There are reasonable interpretations of these constructors, but they're to // be implemented on demand. 
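// Minimal sketch of the strong/weak counting scheme used in the Storage
// destructor above (generic standalone example, not the ATen types): the
// payload is torn down when the last strong reference dies, while the
// control object itself stays alive until the last weak reference is gone.
// The initial strong reference also counts as one weak reference, mirroring
// `weakcount(1) // from the strong reference` below.
#include <atomic>

struct RefCounted {
  std::atomic<int> refcount{1};   // strong references
  std::atomic<int> weakcount{1};  // weak references + 1 held by the strong side
  void (*finalizer)() = nullptr;  // optional user teardown hook

  // Assumes the object was heap-allocated; called when a strong owner dies.
  void releaseStrong() {
    if (--refcount == 0) {
      if (finalizer) finalizer();  // run teardown exactly once
      finalizer = nullptr;
      // ...release the payload here (data_ptr.clear() in Storage)...
      if (--weakcount == 0) {
        delete this;               // no weak observers left either
      }
    }
  }
};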
diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index 6e3d693d012c5c..a26f8971310aa5 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,6 +12,8 @@ StorageImpl::StorageImpl( : scalar_type(scalar_type), data_ptr(std::move(data_ptr)), size(size), + refcount(1), + weakcount(1), // from the strong reference resizable(resizable), allocator(allocator), finalizer(nullptr) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index f1c23c54677dba..c48ec51e013d4c 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -40,7 +39,7 @@ namespace at { struct Type; -struct AT_API StorageImpl : public Retainable { +struct TH_CPP_API StorageImpl { StorageImpl() = delete; virtual ~StorageImpl() {}; @@ -49,6 +48,8 @@ struct AT_API StorageImpl : public Retainable { at::ScalarType scalar_type; at::DataPtr data_ptr; ptrdiff_t size; + std::atomic refcount; + std::atomic weakcount; bool resizable; at::Allocator* allocator; std::unique_ptr finalizer; @@ -57,8 +58,6 @@ struct AT_API StorageImpl : public Retainable { StorageImpl(StorageImpl&&) = delete; StorageImpl(const StorageImpl&&) = delete; - // TODO: Rename this into th_data, and move it out of the class; - // the real data shouldn't call th::from_type template inline T* data() const { auto scalar_type_T = at::CTypeToScalarType>::to(); @@ -77,14 +76,6 @@ struct AT_API StorageImpl : public Retainable { return static_cast(this->data_ptr.get()); } - void release_resources() { - if (finalizer) { - (*finalizer)(); - } - finalizer = nullptr; - data_ptr.clear(); - } - void operator=(const StorageImpl&) = delete; virtual size_t elementSize() const { @@ -103,6 +94,9 @@ struct AT_API StorageImpl : public Retainable { const void* data() const { return data_ptr.get(); }; + void retain() { + ++refcount; + } int getDevice() const { return data_ptr.device().index(); diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 8ebcfdaeada40f..55e7d3de6dea4a 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -64,6 +64,7 @@ class THLongStorageView { storage.size = ref.size(); } storage.scalar_type = at::CTypeToScalarType>::to(); + storage.refcount = 0; storage.set_resizable(false); } private: diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 15f59e902182c4..60f6098762cd05 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -9,7 +9,7 @@ struct AT_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} explicit TensorGeometry(IntList sizes) - : sizes_(sizes.vec()) + : sizes_(sizes) , strides_(sizes.size()) , storage_offset_(0) { int64_t dim = sizes.size(); @@ -21,8 +21,8 @@ struct AT_API TensorGeometry { } explicit TensorGeometry(const Tensor& t) - : sizes_(t.sizes().vec()) - , strides_(t.strides().vec()) + : sizes_(t.sizes()) + , strides_(t.strides()) , storage_offset_(t.storage_offset()) {} // true if the tensor is contiguous diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp index a48cb033b2de49..59cc303a1acf5c 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/TensorImpl.cpp @@ -2,23 +2,10 @@ #include #include -#include - -#include #include namespace at { - -Type& TensorImpl::type() const { - Type* base_type = &globalContext().getType(backend_, scalar_type_); - if (is_variable_) { - return 
detail::getVariableHooks().getVariableType(*base_type); - } else { - return *base_type; - } -} - Tensor& TensorImpl::grad() { AT_ERROR("grad is not implemented for Tensor"); } diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 1aa4d8390ed175..9c3591eb96b31f 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -18,18 +18,16 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public Retainable { - explicit TensorImpl(Backend backend, ScalarType scalar_type, THTensor * tensor, bool is_variable) - : backend_(backend), scalar_type_(scalar_type), is_variable_(is_variable), tensor(tensor) {} + explicit TensorImpl(Type * type, THTensor * tensor) + : type_(type), tensor(tensor) {} virtual ~TensorImpl(); virtual void release_resources() override; - // The implementation of this method will have to be hoisted out and - // hooked in, so that Caffe2 doesn't need to know about Context - // TODO: This really really needs to be inlined. - Type & type() const; - + Type & type() const { + return *type_; + } const char * toString() const; virtual IntList sizes() const; virtual IntList strides() const; @@ -93,12 +91,8 @@ struct AT_API TensorImpl : public Retainable { virtual void set_data(Tensor new_data); protected: - Backend backend_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; - bool is_variable_ = false; bool is_wrapped_number_ = false; + Type * type_; public: THTensor * tensor; }; diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index ecfb70fa1bbede..5e4059421c1283 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -6,7 +6,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? UndefinedTensor::UndefinedTensor() -: TensorImpl(Backend::Undefined, ScalarType::Undefined, nullptr, /* is variable */ false) { +: TensorImpl(&(globalContext().getType(Backend::Undefined,ScalarType::Undefined)), nullptr) { } IntList UndefinedTensor::sizes() const { diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp deleted file mode 100644 index 5bb595a0bce5de..00000000000000 --- a/aten/src/ATen/core/ATenCoreTest.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include - -namespace at { - -static int CoreTestGlobal = 0; -int CoreTest() { - return CoreTestGlobal++; -} - -} // namespace at diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h deleted file mode 100644 index ee8471f66fe258..00000000000000 --- a/aten/src/ATen/core/ATenCoreTest.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#include - -namespace at { - -AT_CORE_API int CoreTest(); -} diff --git a/aten/src/ATen/core/ArrayRef.h b/aten/src/ATen/core/ArrayRef.h deleted file mode 100644 index 7e997d6572f3c0..00000000000000 --- a/aten/src/ATen/core/ArrayRef.h +++ /dev/null @@ -1,212 +0,0 @@ -//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::ArrayRef. -// removed llvm-specific functionality -// removed some implicit const -> non-const conversions that rely on -// complicated std::enable_if meta-programming -// removed a bunch of slice variants for simplicity... 
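// Usage sketch for the ArrayRef view type defined below (function names here
// are hypothetical; the include path is illustrative). Because ArrayRef is a
// non-owning (pointer, length) pair with implicit constructors, one signature
// accepts vectors, std::arrays, C arrays and brace lists alike, as long as
// the underlying buffer outlives the call.
#include <array>
#include <cstdint>
#include <vector>
#include <ATen/ArrayRef.h>  // wherever at::ArrayRef lives in this tree

static int64_t sumAll(at::ArrayRef<int64_t> xs) {
  int64_t total = 0;
  for (int64_t x : xs) total += x;
  return total;
}

static void arrayRefDemo() {
  std::vector<int64_t> v = {1, 2, 3};
  std::array<int64_t, 2> a = {{4, 5}};
  int64_t c[] = {6, 7};
  sumAll(v);           // from std::vector
  sumAll(a);           // from std::array
  sumAll(c);           // from C array
  sumAll({8, 9, 10});  // from initializer_list (temporary: use immediately)
}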
- -#pragma once - -#include -#include -#include - -#include -#include -#include - -namespace at { - -/// ArrayRef - Represent a constant reference to an array (0 or more elements -/// consecutively in memory), i.e. a start pointer and a length. It allows -/// various APIs to take consecutive elements easily and conveniently. -/// -/// This class does not own the underlying data, it is expected to be used in -/// situations where the data resides in some other buffer, whose lifetime -/// extends past that of the ArrayRef. For this reason, it is not in general -/// safe to store an ArrayRef. -/// -/// This is intended to be trivially copyable, so it should be passed by -/// value. -template -class ArrayRef final { - public: - using iterator = const T*; - using const_iterator = const T*; - using size_type = size_t; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T* Data; - - /// The number of elements. - size_type Length; - - public: - /// @name Constructors - /// @{ - - /// Construct an empty ArrayRef. - /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - // TODO Make this explicit - constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - constexpr ArrayRef(const T* data, size_t length) - : Data(data), Length(length) {} - - /// Construct an ArrayRef from a range. - constexpr ArrayRef(const T* begin, const T* end) - : Data(begin), Length(end - begin) {} - - /// Construct an ArrayRef from a SmallVector. This is templated in order to - /// avoid instantiating SmallVectorTemplateCommon whenever we - /// copy-construct an ArrayRef. - template - /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from a std::vector. - template - /* implicit */ ArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from a std::array - template - /* implicit */ constexpr ArrayRef(const std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) - : Data(Vec.begin() == Vec.end() ? static_cast(nullptr) : Vec.begin()), - Length(Vec.size()) {} - - /// @} - /// @name Simple Operations - /// @{ - - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr const T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - - /// front - Get the first element. - AT_CPP14_CONSTEXPR const T& front() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; - } - - /// back - Get the last element. 
- AT_CPP14_CONSTEXPR const T& back() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length - 1]; - } - - /// equals - Check for element-wise equality. - constexpr bool equals(ArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); - } - - /// slice(n, m) - Chop off the first N elements of the array, and keep M - /// elements in the array. - AT_CPP14_CONSTEXPR ArrayRef slice(size_t N, size_t M) const { - AT_CHECK( - N + M <= size(), - "ArrayRef: invalid slice, N = ", - N, - "; M = ", - M, - "; size = ", - size()); - return ArrayRef(data() + N, M); - } - - /// slice(n) - Chop off the first N elements of the array. - constexpr ArrayRef slice(size_t N) const { - return slice(N, size() - N); - } - - /// @} - /// @name Operator Overloads - /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } - - /// Vector compatibility - AT_CPP14_CONSTEXPR const T& at(size_t Index) const { - AT_CHECK( - Index < Length, - "ArrayRef: invalid index Index = ", - Index, - "; Length = ", - Length); - return Data[Index]; - } - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - typename std::enable_if::value, ArrayRef>::type& - operator=(U&& Temporary) = delete; - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - typename std::enable_if::value, ArrayRef>::type& - operator=(std::initializer_list) = delete; - - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data + Length); - } - - /// @} -}; - -} // namespace at diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h deleted file mode 100644 index ec4c17c6f6a531..00000000000000 --- a/aten/src/ATen/core/Backtrace.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace at { -/// Utility to demangle a C++ symbol name. -AT_CORE_API std::string demangle(const char* name); - -/// Returns the printable name of the type. 
-template -inline const char* demangle_type() { -#ifdef __GXX_RTTI - static const std::string name = demangle(typeid(T).name()); - return name.c_str(); -#else // __GXX_RTTI - return "(RTTI disabled, cannot show name)"; -#endif // __GXX_RTTI -} - -AT_CORE_API std::string get_backtrace( - size_t frames_to_skip = 0, - size_t maximum_number_of_frames = 64, - bool skip_python_frames = true); -} // namespace at diff --git a/aten/src/ATen/core/C++17.cpp b/aten/src/ATen/core/C++17.cpp deleted file mode 100644 index 6074cb6be15e9c..00000000000000 --- a/aten/src/ATen/core/C++17.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt deleted file mode 100644 index 59149be784c3a6..00000000000000 --- a/aten/src/ATen/core/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -# This file solely exists to let Caffe2 Android build get at the list -# of core files without having to trundle through all of ATen's CMakeLists.txt - -FILE(GLOB ATen_CORE_HEADERS "*.h") -FILE(GLOB ATen_CORE_SRCS "*.cpp") -FILE(GLOB ATen_CORE_TEST_SRCS "*_test.cpp") -EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) - -# Pass to parent -set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE) -set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) -set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) -# This is a little dodgy, because it means ALL ATen headers are made -# visible. Fortunately, you should just get a lot of undefined symbol -# errors if you go outside core -set(ATen_CORE_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../.. PARENT_SCOPE) diff --git a/aten/src/ATen/core/CoreAPI.h b/aten/src/ATen/core/CoreAPI.h deleted file mode 100644 index 0ee114d9f4cfdd..00000000000000 --- a/aten/src/ATen/core/CoreAPI.h +++ /dev/null @@ -1,20 +0,0 @@ -// You can use the definition AT_CORE_STATIC_WINDOWS to control whether -// or not we apply __declspec. You will want to set this as -// -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links -// against ATen/core on Windows, when ATen/core is built as a -// static library (in which case, saying the symbol is coming -// from a DLL would be incorrect). - -#ifdef _WIN32 -#if !defined(AT_CORE_STATIC_WINDOWS) -#if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) -#define AT_CORE_API __declspec(dllexport) -#else -#define AT_CORE_API __declspec(dllimport) -#endif -#else -#define AT_CORE_API -#endif -#else -#define AT_CORE_API -#endif diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h deleted file mode 100644 index b95a5f120f21b8..00000000000000 --- a/aten/src/ATen/core/Error.h +++ /dev/null @@ -1,147 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -namespace at { - -namespace detail { - -inline std::ostream& _str(std::ostream& ss) { - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t) { - ss << t; - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { - return _str(_str(ss, t), args...); -} - -} // namespace detail - -// Convert a list of string-like arguments into a single string. -template -inline std::string str(const Args&... args) { - std::ostringstream ss; - detail::_str(ss, args...); - return ss.str(); -} - -// Specializations for already-a-string types. 
-template <> -inline std::string str(const std::string& str) { - return str; -} -inline std::string str(const char* c_str) { - return c_str; -} - -/// Represents a location in source code (for debugging). -struct SourceLocation { - const char* function; - const char* file; - uint32_t line; -}; - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); - -/// The primary ATen error class. -/// Provides a complete error message with source location information via -/// `what()`, and a more concise message via `what_without_backtrace()`. Should -/// primarily be used with the `AT_ERROR` macro. -/// -/// NB: at::Error is handled specially by the default torch to suppress the -/// backtrace, see torch/csrc/Exceptions.h -class AT_CORE_API Error : public std::exception { - std::string what_without_backtrace_; - std::string what_; - - public: - Error(SourceLocation source_location, std::string err); - - /// Returns the complete error message, including the source location. - const char* what() const noexcept override { - return what_.c_str(); - } - - /// Returns only the error message string, without source location. - const char* what_without_backtrace() const noexcept { - return what_without_backtrace_.c_str(); - } -}; - -class AT_CORE_API Warning { - using handler_t = - void (*)(const SourceLocation& source_location, const char* msg); - - public: - /// Issue a warning with a given message. Dispatched to the current - /// warning handler. - static void warn(SourceLocation source_location, std::string msg); - - /// Sets the global warning handler. This is not thread-safe, so it should - /// generally be called once during initialization. - static void set_warning_handler(handler_t handler); - - /// The default warning handler. Prints the message to stderr. - static void print_warning( - const SourceLocation& source_location, - const char* msg); - - private: - static handler_t warning_handler_; -}; - -} // namespace at - -// TODO: variants that print the expression tested and thus don't require -// strings -// TODO: CAFFE_ENFORCE_WITH_CALLER style macro - -#define AT_ERROR(...) \ - throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_WARN(...) \ - at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_ASSERT(cond) \ - if (!(cond)) { \ - AT_ERROR( \ - #cond " ASSERT FAILED at ", \ - __FILE__, \ - ":", \ - __LINE__, \ - ", please report a bug to PyTorch."); \ - } - -#define AT_ASSERTM(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(at::str( \ - #cond, \ - " ASSERT FAILED at ", \ - __FILE__, \ - ":", \ - __LINE__, \ - ", please report a bug to PyTorch. ", \ - __VA_ARGS__)); \ - } - -#define AT_CHECK(cond, ...) 
\ - if (!(cond)) { \ - AT_ERROR(at::str(__VA_ARGS__)); \ - } diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h deleted file mode 100644 index d89b496d7083b8..00000000000000 --- a/aten/src/ATen/core/Half-inl.h +++ /dev/null @@ -1,249 +0,0 @@ -#pragma once - -#include -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -#if defined(__HIP_DEVICE_COMPILE__) -#include -#endif - -namespace at { - -/// Constructors - -inline AT_HOSTDEVICE Half::Half(float value) { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - x = __half_as_short(__float2half(value)); -#else - x = detail::float2halfbits(value); -#endif -} - -/// Implicit conversions - -inline AT_HOSTDEVICE Half::operator float() const { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(*reinterpret_cast(&x)); -#else - return detail::halfbits2float(x); -#endif -} - -#ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { - x = *reinterpret_cast(&value); -} -inline AT_HOSTDEVICE Half::operator __half() const { - return *reinterpret_cast(&x); -} -#endif - -/// Arithmetic - -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { - return (float)a + (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { - return (float)a - (float)b; -} - -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { - return (float)a * (float)b; -} - -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { - return (float)a / (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a) { - return -(float)a; -} - -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { - a = a + b; - return a; -} - -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { - a = a - b; - return a; -} - -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { - a = a * b; - return a; -} - -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { - a = a / b; - return a; -} - -/// Arithmetic with floats - -inline AT_HOSTDEVICE float operator+(Half a, float b) { - return (float)a + b; -} -inline AT_HOSTDEVICE float operator-(Half a, float b) { - return (float)a - b; -} -inline AT_HOSTDEVICE float operator*(Half a, float b) { - return (float)a * b; -} -inline AT_HOSTDEVICE float operator/(Half a, float b) { - return (float)a / b; -} - -inline AT_HOSTDEVICE float operator+(float a, Half b) { - return a + (float)b; -} -inline AT_HOSTDEVICE float operator-(float a, Half b) { - return a - (float)b; -} -inline AT_HOSTDEVICE float operator*(float a, Half b) { - return a * (float)b; -} -inline AT_HOSTDEVICE float operator/(float a, Half b) { - return a / (float)b; -} - -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { - return a += (float)b; -} -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { - return a -= (float)b; -} -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { - return a *= (float)b; -} -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { - return a /= (float)b; -} - -/// Arithmetic with doubles - -inline AT_HOSTDEVICE double operator+(Half a, double b) { - return (double)a + b; -} -inline AT_HOSTDEVICE double operator-(Half a, double b) { - return (double)a - b; -} -inline AT_HOSTDEVICE double operator*(Half a, double b) { - return (double)a * b; -} -inline AT_HOSTDEVICE double operator/(Half a, double b) { - return (double)a / b; -} - -inline AT_HOSTDEVICE double operator+(double a, Half b) { - return a + (double)b; -} 
-inline AT_HOSTDEVICE double operator-(double a, Half b) { - return a - (double)b; -} -inline AT_HOSTDEVICE double operator*(double a, Half b) { - return a * (double)b; -} -inline AT_HOSTDEVICE double operator/(double a, Half b) { - return a / (double)b; -} - -/// Arithmetic with ints - -inline AT_HOSTDEVICE Half operator+(Half a, int b) { - return a + (Half)b; -} -inline AT_HOSTDEVICE Half operator-(Half a, int b) { - return a - (Half)b; -} -inline AT_HOSTDEVICE Half operator*(Half a, int b) { - return a * (Half)b; -} -inline AT_HOSTDEVICE Half operator/(Half a, int b) { - return a / (Half)b; -} - -inline AT_HOSTDEVICE Half operator+(int a, Half b) { - return (Half)a + b; -} -inline AT_HOSTDEVICE Half operator-(int a, Half b) { - return (Half)a - b; -} -inline AT_HOSTDEVICE Half operator*(int a, Half b) { - return (Half)a * b; -} -inline AT_HOSTDEVICE Half operator/(int a, Half b) { - return (Half)a / b; -} - -/// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from at::Half to float. - -} // namespace at - -namespace std { - -template <> -class numeric_limits { - public: - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = - numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - static constexpr int radix = 2; - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = - numeric_limits::tinyness_before; - static constexpr at::Half min() { - return at::Half(0x0400, at::Half::from_bits); - } - static constexpr at::Half lowest() { - return at::Half(0xFBFF, at::Half::from_bits); - } - static constexpr at::Half max() { - return at::Half(0x7BFF, at::Half::from_bits); - } - static constexpr at::Half epsilon() { - return at::Half(0x1400, at::Half::from_bits); - } - static constexpr at::Half round_error() { - return at::Half(0x3800, at::Half::from_bits); - } - static constexpr at::Half infinity() { - return at::Half(0x7C00, at::Half::from_bits); - } - static constexpr at::Half quiet_NaN() { - return at::Half(0x7E00, at::Half::from_bits); - } - static constexpr at::Half signaling_NaN() { - return at::Half(0x7D00, at::Half::from_bits); - } - static constexpr at::Half denorm_min() { - return at::Half(0x0001, at::Half::from_bits); - } -}; - -} // namespace std diff --git a/aten/src/ATen/core/Half.cpp b/aten/src/ATen/core/Half.cpp deleted file mode 100644 index e511f03a92bc73..00000000000000 --- a/aten/src/ATen/core/Half.cpp +++ /dev/null @@ -1,105 +0,0 @@ -#include - -#include - -namespace at { - -static_assert( - std::is_standard_layout::value, - "at::Half must be standard layout."); - -namespace detail { - -// Host functions for converting between FP32 and FP16 formats - -float halfbits2float(unsigned 
short h) { - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71; - do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70; - } - - unsigned result_bit = (sign << 31) | (exponent << 23) | mantissa; - - // Reinterpret the result bit pattern as a float - float result_float; - std::memcpy(&result_float, &result_bit, sizeof(result_float)); - return result_float; -} - -unsigned short float2halfbits(float src) { - // Reinterpret the float as a bit pattern - unsigned x; - std::memcpy(&x, &src, sizeof(x)); - - unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; - - // Get rid of +NaN/-NaN case first. - if (u > 0x7f800000) { - return 0x7fffU; - } - - sign = ((x >> 16) & 0x8000); - - // Get rid of +Inf/-Inf, +0/-0. - if (u > 0x477fefff) { - return sign | 0x7c00U; - } - if (u < 0x33000001) { - return (sign | 0x0000); - } - - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); - - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; - } else { - shift = 0x7e - exponent; - exponent = 0; - mantissa |= 0x800000; - } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); - - // Round to nearest even. - remainder = (mantissa & lsb_m1); - mantissa >>= shift; - if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { - ++mantissa; - if (!(mantissa & 0x3ff)) { - ++exponent; - mantissa = 0; - } - } - - return (sign | (exponent << 10) | mantissa); -} - -} // namespace detail - -std::ostream& operator<<(std::ostream& out, const Half& value) { - out << (float)value; - return out; -} - -} // namespace at diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h deleted file mode 100644 index 385f18e78cab02..00000000000000 --- a/aten/src/ATen/core/Half.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -/// Defines the Half type (half-precision floating-point) including conversions -/// to standard C types and basic arithmetic operations. Note that arithmetic -/// operations are implemented by converting to floating point and -/// performing the operation in float32, instead of using CUDA half intrinisics. -/// Most uses of this type within ATen are memory bound, including the -/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. -/// If you are writing a compute bound kernel, you can use the CUDA half -/// intrinsics directly on the Half type from device code. 
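// Usage sketch for the Half type declared below (include path illustrative).
// Arithmetic on at::Half promotes to float, as the note above explains, and
// detail::float2halfbits / detail::halfbits2float expose the raw conversions,
// so a round trip shows the precision loss of the 10-bit mantissa.
#include <iostream>
#include <ATen/Half.h>  // wherever at::Half lives in this tree

static void halfDemo() {
  at::Half a = 1.5f;  // exactly representable in half precision
  at::Half b = 0.1f;  // rounded to the nearest half-precision value
  float sum = a + b;  // computed in float32, not with half intrinsics

  unsigned short bits = at::detail::float2halfbits(0.1f);
  float back = at::detail::halfbits2float(bits);
  std::cout << sum << " " << back << "\n";  // `back` differs slightly from 0.1f
}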
- -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -#if defined(__HIP_DEVICE_COMPILE__) -#include -#endif - -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#else -#define AT_HOSTDEVICE -#endif -#endif - -namespace at { - -namespace detail { - -AT_CORE_API float halfbits2float(unsigned short bits); -AT_CORE_API unsigned short float2halfbits(float value); - -} // namespace detail - -struct alignas(2) Half { - unsigned short x; - - struct from_bits_t {}; - static constexpr from_bits_t from_bits = from_bits_t(); - - // HIP wants __host__ __device__ tag, CUDA does not -#ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; -#else - Half() = default; -#endif - - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; - -#ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; -#endif -}; - -template -To convert(From f) { - return static_cast(f); -} - -// skip isnan and isinf check for integral types -template -typename std::enable_if::value, bool>::type overflows( - From f) { - using limit = std::numeric_limits; - if (!limit::is_signed && std::numeric_limits::is_signed) { - // allow for negative numbers to wrap using two's complement arithmetic. - // For example, with uint8, this allows for `a - b` to be treated as - // `a + 255 * b`. - return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); - } else { - return f < limit::lowest() || f > limit::max(); - } -} - -template -typename std::enable_if::value, bool>::type overflows( - From f) { - using limit = std::numeric_limits; - if (limit::has_infinity && std::isinf((double)f)) { - return false; - } - if (!limit::has_quiet_NaN && (f != f)) { - return true; - } - return f < limit::lowest() || f > limit::max(); -} - -template -To checked_convert(From f, const char* name) { - if (overflows(f)) { - std::ostringstream oss; - oss << "value cannot be converted to type " << name << " without overflow: " << f; - throw std::domain_error(oss.str()); - } - return convert(f); -} - -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - -AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); - -} // namespace at - -#include "ATen/core/Half-inl.h" - -#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h deleted file mode 100644 index 7d152269d9a8c2..00000000000000 --- a/aten/src/ATen/core/IdWrapper.h +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once - -#include - -namespace at { - -/** - * This template simplifies generation of simple classes that wrap an id - * in a typesafe way. Namely, you can use it to create a very lightweight - * type that only offers equality comparators and hashing. Example: - * - * struct MyIdType final : IdWrapper { - * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} - * }; - * - * Then in the global top level namespace: - * - * AT_DEFINE_HASH_FOR_IDWRAPPER(MyIdType); - * - * That's it - equality operators and hash functions are automatically defined - * for you, given the underlying type supports it. 
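// Sketch expanding the doc example above (type name and values are
// hypothetical, and the template parameter order is assumed from the aliases
// in the class body): once the hash macro is invoked at global scope, the
// wrapper can be used as a key in standard containers while staying distinct
// from a raw uint32_t.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <ATen/IdWrapper.h>  // wherever at::IdWrapper lives in this tree

struct MyIdType final : at::IdWrapper<MyIdType, uint32_t> {
  constexpr explicit MyIdType(uint32_t id) : IdWrapper(id) {}
};
AT_DEFINE_HASH_FOR_IDWRAPPER(MyIdType);

static void idWrapperDemo() {
  std::unordered_map<MyIdType, std::string> names;
  names.emplace(MyIdType(1), "conv1");
  names.emplace(MyIdType(2), "relu1");
  // Equality and hashing come from IdWrapper; there is no implicit conversion
  // back to uint32_t, so ids cannot be mixed up with plain integers.
}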
- */ -template -class IdWrapper { - public: - using underlying_type = UnderlyingType; - using concrete_type = ConcreteType; - - protected: - constexpr explicit IdWrapper(underlying_type id) noexcept( - noexcept(underlying_type(std::declval()))) - : id_(id) {} - - constexpr underlying_type underlyingId() const - noexcept(noexcept(underlying_type(std::declval()))) { - return id_; - } - - private: - friend size_t hash_value(const concrete_type& v) { - return std::hash()(v.id_); - } - - // TODO Making operator== noexcept if underlying type is noexcept equality - // comparable doesn't work with GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator==( - const concrete_type& lhs, - const concrete_type& rhs) { - return lhs.id_ == rhs.id_; - } - - // TODO Making operator!= noexcept if operator== is noexcept doesn't work with - // GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator!=( - const concrete_type& lhs, - const concrete_type& rhs) { - return !(lhs == rhs); - } - - underlying_type id_; -}; - -} // namespace at - -#define AT_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ - namespace std { \ - template <> \ - struct hash { \ - size_t operator()(ClassName x) const { \ - return hash_value(x); \ - } \ - }; \ - } diff --git a/aten/src/ATen/core/README.md b/aten/src/ATen/core/README.md deleted file mode 100644 index 71654f44e26f91..00000000000000 --- a/aten/src/ATen/core/README.md +++ /dev/null @@ -1,5 +0,0 @@ -ATen Core ---------- - -ATen Core is a minimal subset of ATen which is suitable for deployment -on mobile. Binary size of files in this folder is an important constraint. diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h deleted file mode 100644 index 269b21b0d5cf37..00000000000000 --- a/aten/src/ATen/core/SmallVector.h +++ /dev/null @@ -1,1034 +0,0 @@ -//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the SmallVector class. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::SmallVector. -// replaced report_bad_alloc_error with std::bad_alloc -// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE -// replaced iterator_range constructor with inline Container&& constructor -// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers -// removed LLVM_UNLIKELY - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if __GNUG__ && __GNUC__ < 5 -#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) -#else -#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value -#endif - -namespace at { - -namespace detail { - -// From llvm/Support/MathExtras.h -static inline uint64_t NextPowerOf2(uint64_t A) { - A |= (A >> 1); - A |= (A >> 2); - A |= (A >> 4); - A |= (A >> 8); - A |= (A >> 16); - A |= (A >> 32); - return A + 1; -} - -} // namespace detail - -/// This is all the non-templated stuff common to all SmallVectors. 
-class AT_CORE_API SmallVectorBase { - protected: - void *BeginX, *EndX, *CapacityX; - - protected: - SmallVectorBase(void* FirstEl, size_t Size) - : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl + Size) {} - - /// This is an implementation of the grow() method which only works - /// on POD-like data types and is out of line to reduce code duplication. - void grow_pod(void* FirstEl, size_t MinSizeInBytes, size_t TSize); - - public: - /// This returns size()*sizeof(T). - size_t size_in_bytes() const { - return size_t((char*)EndX - (char*)BeginX); - } - - /// capacity_in_bytes - This returns capacity()*sizeof(T). - size_t capacity_in_bytes() const { - return size_t((char*)CapacityX - (char*)BeginX); - } - - bool empty() const { - return BeginX == EndX; - } -}; - -/// This is the part of SmallVectorTemplateBase which does not depend on whether -/// the type T is a POD. The extra dummy template argument is used by ArrayRef -/// to avoid unnecessarily requiring T to be complete. -template -class SmallVectorTemplateCommon : public SmallVectorBase { - private: - template - friend struct SmallVectorStorage; - - // Allocate raw space for N elements of type T. If T has a ctor or dtor, we - // don't want it to be automatically run, so we need to represent the space as - // something else. Use an array of char of sufficient alignment. - using U = AlignedCharArrayUnion; - U FirstEl; - // Space after 'FirstEl' is clobbered, do not add any instance vars after it. - - protected: - SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} - - void grow_pod(size_t MinSizeInBytes, size_t TSize) { - SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); - } - - /// Return true if this is a smallvector which has not had dynamic - /// memory allocated for it. - bool isSmall() const { - return BeginX == static_cast(&FirstEl); - } - - /// Put this vector in a state of being small. - void resetToSmall() { - BeginX = EndX = CapacityX = &FirstEl; - } - - void setEnd(T* P) { - this->EndX = P; - } - - public: - using size_type = size_t; - using difference_type = ptrdiff_t; - using value_type = T; - using iterator = T*; - using const_iterator = const T*; - - using const_reverse_iterator = std::reverse_iterator; - using reverse_iterator = std::reverse_iterator; - - using reference = T&; - using const_reference = const T&; - using pointer = T*; - using const_pointer = const T*; - - // forward iterator creation methods. - iterator begin() { - return (iterator)this->BeginX; - } - const_iterator begin() const { - return (const_iterator)this->BeginX; - } - iterator end() { - return (iterator)this->EndX; - } - const_iterator end() const { - return (const_iterator)this->EndX; - } - - protected: - iterator capacity_ptr() { - return (iterator)this->CapacityX; - } - const_iterator capacity_ptr() const { - return (const_iterator)this->CapacityX; - } - - public: - // reverse iterator creation methods. - reverse_iterator rbegin() { - return reverse_iterator(end()); - } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(end()); - } - reverse_iterator rend() { - return reverse_iterator(begin()); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(begin()); - } - - size_type size() const { - return end() - begin(); - } - size_type max_size() const { - return size_type(-1) / sizeof(T); - } - - /// Return the total number of elements in the currently allocated buffer. 
- size_t capacity() const { - return capacity_ptr() - begin(); - } - - /// Return a pointer to the vector's buffer, even if empty(). - pointer data() { - return pointer(begin()); - } - /// Return a pointer to the vector's buffer, even if empty(). - const_pointer data() const { - return const_pointer(begin()); - } - - reference operator[](size_type idx) { - assert(idx < size()); - return begin()[idx]; - } - const_reference operator[](size_type idx) const { - assert(idx < size()); - return begin()[idx]; - } - - reference front() { - assert(!empty()); - return begin()[0]; - } - const_reference front() const { - assert(!empty()); - return begin()[0]; - } - - reference back() { - assert(!empty()); - return end()[-1]; - } - const_reference back() const { - assert(!empty()); - return end()[-1]; - } -}; - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with non-POD-like T's. -template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { - protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - static void destroy_range(T* S, T* E) { - while (S != E) { - --E; - E->~T(); - } - } - - /// Move the range [I, E) into the uninitialized memory starting with "Dest", - /// constructing elements as needed. - template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy( - std::make_move_iterator(I), std::make_move_iterator(E), Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", - /// constructing elements as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy(I, E, Dest); - } - - /// Grow the allocated memory (without initializing new elements), doubling - /// the size of the allocated memory. Guarantees space for at least one more - /// element, or MinSize more elements if specified. - void grow(size_t MinSize = 0); - - public: - void push_back(const T& Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*)this->end()) T(Elt); - this->setEnd(this->end() + 1); - } - - void push_back(T&& Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*)this->end()) T(::std::move(Elt)); - this->setEnd(this->end() + 1); - } - - void pop_back() { - this->setEnd(this->end() - 1); - this->end()->~T(); - } -}; - -// Define this out-of-line to dissuade the C++ compiler from inlining it. -template -void SmallVectorTemplateBase::grow(size_t MinSize) { - size_t CurCapacity = this->capacity(); - size_t CurSize = this->size(); - // Always grow, even from zero. - size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity + 2)); - if (NewCapacity < MinSize) - NewCapacity = MinSize; - T* NewElts = static_cast(malloc(NewCapacity * sizeof(T))); - if (NewElts == nullptr) - throw std::bad_alloc(); - - // Move the elements over. - this->uninitialized_move(this->begin(), this->end(), NewElts); - - // Destroy the original elements. - destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - free(this->begin()); - - this->setEnd(NewElts + CurSize); - this->BeginX = NewElts; - this->CapacityX = this->begin() + NewCapacity; -} - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with POD-like T's. 
-template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { - protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - // No need to do a destroy loop for POD's. - static void destroy_range(T*, T*) {} - - /// Move the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - // Just do a copy. - uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - // Arbitrary iterator types; just use the basic implementation. - std::uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy( - T1* I, - T1* E, - T2* Dest, - typename std::enable_if< - std::is_same::type, T2>::value>:: - type* = nullptr) { - // Use memcpy for PODs iterated by pointers (which includes SmallVector - // iterators): std::uninitialized_copy optimizes to memmove, but we can - // use memcpy here. Note that I and E are iterators and thus might be - // invalid for memcpy if they are equal. - if (I != E) - memcpy(Dest, I, (E - I) * sizeof(T)); - } - - /// Double the size of the allocated memory, guaranteeing space for at - /// least one more element or MinSize if specified. - void grow(size_t MinSize = 0) { - this->grow_pod(MinSize * sizeof(T), sizeof(T)); - } - - public: - void push_back(const T& Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - memcpy(this->end(), &Elt, sizeof(T)); - this->setEnd(this->end() + 1); - } - - void pop_back() { - this->setEnd(this->end() - 1); - } -}; - -/// This class consists of common code factored out of the SmallVector class to -/// reduce code duplication based on the SmallVector 'N' template parameter. -template -class SmallVectorImpl - : public SmallVectorTemplateBase { - using SuperClass = SmallVectorTemplateBase; - - public: - using iterator = typename SuperClass::iterator; - using const_iterator = typename SuperClass::const_iterator; - using size_type = typename SuperClass::size_type; - - protected: - // Default ctor - Initialize to empty. - explicit SmallVectorImpl(unsigned N) - : SmallVectorTemplateBase(N * sizeof(T)) { - } - - public: - SmallVectorImpl(const SmallVectorImpl&) = delete; - - ~SmallVectorImpl() { - // Destroy the constructed elements in the vector. - this->destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. 
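The destructor here follows the same two-step teardown used throughout the class: run the element destructors, then release the buffer only when it was heap-allocated, since the inline buffer is part of the vector object itself and must never be passed to free(). A standalone sketch of that pattern (plain C++, not ATen code):

#include <cstdlib>
#include <new>
#include <string>

// Destroy [s, e) in reverse order, as destroy_range() above does.
template <typename T>
static void destroy_range(T* s, T* e) {
  while (s != e) { --e; e->~T(); }
}

int main() {
  // "Small" case: elements live in a stack buffer; destroy, but do not free.
  alignas(std::string) unsigned char inline_buf[2 * sizeof(std::string)];
  std::string* small_elts = reinterpret_cast<std::string*>(inline_buf);
  new (small_elts) std::string("a");
  new (small_elts + 1) std::string("b");
  destroy_range(small_elts, small_elts + 2);   // no free(): storage is inline

  // "Big" case: elements live on the heap; destroy, then free the block.
  void* raw = std::malloc(2 * sizeof(std::string));
  std::string* big_elts = static_cast<std::string*>(raw);
  new (big_elts) std::string("c");
  new (big_elts + 1) std::string("d");
  destroy_range(big_elts, big_elts + 2);
  std::free(raw);
  return 0;
}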
- if (!this->isSmall()) - free(this->begin()); - } - - void clear() { - this->destroy_range(this->begin(), this->end()); - this->EndX = this->BeginX; - } - - void resize(size_type N) { - if (N < this->size()) { - this->destroy_range(this->begin() + N, this->end()); - this->setEnd(this->begin() + N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - auto I = this->end(); - for (auto E = this->begin() + N; I != E; ++I) - new (&*I) T(); - this->setEnd(this->begin() + N); - } - } - - void resize(size_type N, const T& NV) { - if (N < this->size()) { - this->destroy_range(this->begin() + N, this->end()); - this->setEnd(this->begin() + N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - std::uninitialized_fill(this->end(), this->begin() + N, NV); - this->setEnd(this->begin() + N); - } - } - - void reserve(size_type N) { - if (this->capacity() < N) - this->grow(N); - } - - T pop_back_val() { - T Result = ::std::move(this->back()); - this->pop_back(); - return Result; - } - - void swap(SmallVectorImpl& RHS); - - /// Add the specified range to the end of the SmallVector. - template < - typename in_iter, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - void append(in_iter in_start, in_iter in_end) { - size_type NumInputs = std::distance(in_start, in_end); - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr() - this->end())) - this->grow(this->size() + NumInputs); - - // Copy the new elements over. - this->uninitialized_copy(in_start, in_end, this->end()); - this->setEnd(this->end() + NumInputs); - } - - /// Add the specified range to the end of the SmallVector. - void append(size_type NumInputs, const T& Elt) { - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr() - this->end())) - this->grow(this->size() + NumInputs); - - // Copy the new elements over. - std::uninitialized_fill_n(this->end(), NumInputs, Elt); - this->setEnd(this->end() + NumInputs); - } - - void append(std::initializer_list IL) { - append(IL.begin(), IL.end()); - } - - // FIXME: Consider assigning over existing elements, rather than clearing & - // re-initializing them - for all assign(...) variants. - - void assign(size_type NumElts, const T& Elt) { - clear(); - if (this->capacity() < NumElts) - this->grow(NumElts); - this->setEnd(this->begin() + NumElts); - std::uninitialized_fill(this->begin(), this->end(), Elt); - } - - template < - typename in_iter, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - void assign(in_iter in_start, in_iter in_end) { - clear(); - append(in_start, in_end); - } - - void assign(std::initializer_list IL) { - clear(); - append(IL); - } - - iterator erase(const_iterator CI) { - // Just cast away constness because this is a non-const member function. - iterator I = const_cast(CI); - - assert(I >= this->begin() && "Iterator to erase is out of bounds."); - assert(I < this->end() && "Erasing at past-the-end iterator."); - - iterator N = I; - // Shift all elts down one. - std::move(I + 1, this->end(), I); - // Drop the last elt. - this->pop_back(); - return (N); - } - - iterator erase(const_iterator CS, const_iterator CE) { - // Just cast away constness because this is a non-const member function. 
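The single-element erase() above is the classic shift-and-pop strategy: move the tail left by one, then destroy the now redundant last slot. The same steps expressed with std::vector, purely as a runnable illustration:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> v{10, 20, 30, 40};
  auto I = v.begin() + 1;          // erase the 20
  std::move(I + 1, v.end(), I);    // v is now {10, 30, 40, 40}
  v.pop_back();                    // drop the stale last element
  assert((v == std::vector<int>{10, 30, 40}));
  return 0;
}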
- iterator S = const_cast(CS); - iterator E = const_cast(CE); - - assert(S >= this->begin() && "Range to erase is out of bounds."); - assert(S <= E && "Trying to erase invalid range."); - assert(E <= this->end() && "Trying to erase past the end."); - - iterator N = S; - // Shift all elts down. - iterator I = std::move(E, this->end(), S); - // Drop the last elts. - this->destroy_range(I, this->end()); - this->setEnd(I); - return (N); - } - - iterator insert(iterator I, T&& Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(::std::move(Elt)); - return this->end() - 1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I - this->begin(); - this->grow(); - I = this->begin() + EltNo; - } - - ::new ((void*)this->end()) T(::std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end() - 1, this->end()); - this->setEnd(this->end() + 1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - T* EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = ::std::move(*EltPtr); - return I; - } - - iterator insert(iterator I, const T& Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(Elt); - return this->end() - 1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I - this->begin(); - this->grow(); - I = this->begin() + EltNo; - } - ::new ((void*)this->end()) T(std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end() - 1, this->end()); - this->setEnd(this->end() + 1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - const T* EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = *EltPtr; - return I; - } - - iterator insert(iterator I, size_type NumToInsert, const T& Elt) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. - append(NumToInsert, Elt); - return this->begin() + InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin() + InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end() - I) >= NumToInsert) { - T* OldEnd = this->end(); - append( - std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd - NumToInsert, OldEnd); - - std::fill_n(I, NumToInsert, Elt); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. 
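The insert() overloads repeatedly use the save-an-index idiom seen above: any operation that may reallocate (grow() or reserve()) invalidates iterators, so the position is converted to an element index first and the iterator is rebuilt afterwards. A small std::vector sketch of the same idiom:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  auto it = v.begin() + 1;
  std::size_t idx = static_cast<std::size_t>(it - v.begin());  // save position as an index
  v.reserve(v.capacity() + 16);                                // may invalidate 'it'
  it = v.begin() + idx;                                        // rebuild the iterator
  assert(*it == 2);
  return 0;
}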
- T* OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd - I; - this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); - - // Replace the overwritten part. - std::fill_n(I, NumOverwritten, Elt); - - // Insert the non-overwritten middle part. - std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, Elt); - return I; - } - - template < - typename ItTy, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - iterator insert(iterator I, ItTy From, ItTy To) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. - append(From, To); - return this->begin() + InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - size_t NumToInsert = std::distance(From, To); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin() + InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end() - I) >= NumToInsert) { - T* OldEnd = this->end(); - append( - std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd - NumToInsert, OldEnd); - - std::copy(From, To, I); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. - T* OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd - I; - this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); - - // Replace the overwritten part. - for (T* J = I; NumOverwritten > 0; --NumOverwritten) { - *J = *From; - ++J; - ++From; - } - - // Insert the non-overwritten middle part. - this->uninitialized_copy(From, To, OldEnd); - return I; - } - - void insert(iterator I, std::initializer_list IL) { - insert(I, IL.begin(), IL.end()); - } - - template - void emplace_back(ArgTypes&&... Args) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*)this->end()) T(std::forward(Args)...); - this->setEnd(this->end() + 1); - } - - SmallVectorImpl& operator=(const SmallVectorImpl& RHS); - - SmallVectorImpl& operator=(SmallVectorImpl&& RHS); - - bool operator==(const SmallVectorImpl& RHS) const { - if (this->size() != RHS.size()) - return false; - return std::equal(this->begin(), this->end(), RHS.begin()); - } - bool operator!=(const SmallVectorImpl& RHS) const { - return !(*this == RHS); - } - - bool operator<(const SmallVectorImpl& RHS) const { - return std::lexicographical_compare( - this->begin(), this->end(), RHS.begin(), RHS.end()); - } - - /// Set the array size to \p N, which the current array must have enough - /// capacity for. - /// - /// This does not construct or destroy any elements in the vector. - /// - /// Clients can use this in conjunction with capacity() to write past the end - /// of the buffer when they know that more elements are available, and only - /// update the size later. 
This avoids the cost of value initializing elements - /// which will only be overwritten. - void set_size(size_type N) { - assert(N <= this->capacity()); - this->setEnd(this->begin() + N); - } -}; - -template -void SmallVectorImpl::swap(SmallVectorImpl& RHS) { - if (this == &RHS) - return; - - // We can only avoid copying elements if neither vector is small. - if (!this->isSmall() && !RHS.isSmall()) { - std::swap(this->BeginX, RHS.BeginX); - std::swap(this->EndX, RHS.EndX); - std::swap(this->CapacityX, RHS.CapacityX); - return; - } - if (RHS.size() > this->capacity()) - this->grow(RHS.size()); - if (this->size() > RHS.capacity()) - RHS.grow(this->size()); - - // Swap the shared elements. - size_t NumShared = this->size(); - if (NumShared > RHS.size()) - NumShared = RHS.size(); - for (size_type i = 0; i != NumShared; ++i) - std::swap((*this)[i], RHS[i]); - - // Copy over the extra elts. - if (this->size() > RHS.size()) { - size_t EltDiff = this->size() - RHS.size(); - this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end()); - RHS.setEnd(RHS.end() + EltDiff); - this->destroy_range(this->begin() + NumShared, this->end()); - this->setEnd(this->begin() + NumShared); - } else if (RHS.size() > this->size()) { - size_t EltDiff = RHS.size() - this->size(); - this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end()); - this->setEnd(this->end() + EltDiff); - this->destroy_range(RHS.begin() + NumShared, RHS.end()); - RHS.setEnd(RHS.begin() + NumShared); - } -} - -template -SmallVectorImpl& SmallVectorImpl::operator=( - const SmallVectorImpl& RHS) { - // Avoid self-assignment. - if (this == &RHS) - return *this; - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. - size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd; - if (RHSSize) - NewEnd = std::copy(RHS.begin(), RHS.begin() + RHSSize, this->begin()); - else - NewEnd = this->begin(); - - // Destroy excess elements. - this->destroy_range(NewEnd, this->end()); - - // Trim. - this->setEnd(NewEnd); - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. - if (this->capacity() < RHSSize) { - // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::copy(RHS.begin(), RHS.begin() + CurSize, this->begin()); - } - - // Copy construct the new elements in place. - this->uninitialized_copy( - RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); - - // Set end. - this->setEnd(this->begin() + RHSSize); - return *this; -} - -template -SmallVectorImpl& SmallVectorImpl::operator=(SmallVectorImpl&& RHS) { - // Avoid self-assignment. - if (this == &RHS) - return *this; - - // If the RHS isn't small, clear this vector and then steal its buffer. - if (!RHS.isSmall()) { - this->destroy_range(this->begin(), this->end()); - if (!this->isSmall()) - free(this->begin()); - this->BeginX = RHS.BeginX; - this->EndX = RHS.EndX; - this->CapacityX = RHS.CapacityX; - RHS.resetToSmall(); - return *this; - } - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. 
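The move assignment above steals the heap buffer only when the right-hand side is not small; if its elements live in the inline buffer, the data pointers refer into the RHS object itself, so they cannot simply be handed over and the element-wise std::move path is taken instead. A contrived sketch of the aliasing hazard (a toy type, not ATen code):

#include <cassert>

// Toy illustration: when a container's data pointer refers into the object's
// own inline buffer, that pointer cannot be transferred to another object.
struct InlineBuf {
  int storage[4] = {0, 0, 0, 0};
  int* begin = storage;                  // "small" state: points into *this
  bool is_small() const { return begin == storage; }
};

int main() {
  InlineBuf a, b;
  assert(a.is_small() && b.is_small());

  b.begin = a.begin;                     // naive "steal" of the buffer pointer
  assert(!b.is_small());                 // b now aliases a's inline storage...
  assert(b.begin == a.storage);          // ...which dies with a; hence the
                                         // element-wise move fallback.
  return 0;
}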
- size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd = this->begin(); - if (RHSSize) - NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); - - // Destroy excess elements and trim the bounds. - this->destroy_range(NewEnd, this->end()); - this->setEnd(NewEnd); - - // Clear the RHS. - RHS.clear(); - - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: this may not actually make any sense if we can efficiently move - // elements. - if (this->capacity() < RHSSize) { - // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::move(RHS.begin(), RHS.begin() + CurSize, this->begin()); - } - - // Move-construct the new elements in place. - this->uninitialized_move( - RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); - - // Set end. - this->setEnd(this->begin() + RHSSize); - - RHS.clear(); - return *this; -} - -/// Storage for the SmallVector elements which aren't contained in -/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' -/// element is in the base class. This is specialized for the N=1 and N=0 cases -/// to avoid allocating unnecessary storage. -template -struct SmallVectorStorage { - typename SmallVectorTemplateCommon::U InlineElts[N - 1]; -}; -template -struct SmallVectorStorage {}; -template -struct SmallVectorStorage {}; - -/// This is a 'vector' (really, a variable-sized array), optimized -/// for the case when the array is small. It contains some number of elements -/// in-place, which allows it to avoid heap allocation when the actual number of -/// elements is below that threshold. This allows normal "small" cases to be -/// fast without losing generality for large inputs. -/// -/// Note that this does not attempt to be exception safe. -/// -template -class SmallVector : public SmallVectorImpl { - /// Inline space for elements which aren't stored in the base class. 
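SmallVectorStorage supplies only N - 1 slots because the common base class already reserves space for one element (FirstEl), and the N = 1 and N = 0 specializations above collapse to empty structs. A layout sketch that leans on the same assumption the original makes, namely that the extra slots end up directly after the base's first element:

#include <iostream>
#include <type_traits>

template <typename T>
struct BaseWithOneElt {
  typename std::aligned_storage<sizeof(T), alignof(T)>::type FirstEl;  // 1 inline slot
};

template <typename T, unsigned N>
struct ExtraSlots {
  typename std::aligned_storage<sizeof(T), alignof(T)>::type InlineElts[N - 1];
};

template <typename T, unsigned N>
struct ToyVec : BaseWithOneElt<T> {
  ExtraSlots<T, N> Storage;  // N - 1 more slots, mirroring SmallVectorStorage
};

int main() {
  // 1 + 7 slots of inline space for a "small size" of 8.
  std::cout << sizeof(ToyVec<int, 8>) << " bytes of inline storage\n";
  return 0;
}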
- SmallVectorStorage Storage; - - public: - SmallVector() : SmallVectorImpl(N) {} - - explicit SmallVector(size_t Size, const T& Value = T()) - : SmallVectorImpl(N) { - this->assign(Size, Value); - } - - template < - typename ItTy, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { - this->append(S, E); - } - - template - explicit SmallVector(Container&& c) : SmallVectorImpl(N) { - this->append(c.begin(), c.end()); - } - - SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { - this->assign(IL); - } - - SmallVector(const SmallVector& RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(RHS); - } - - const SmallVector& operator=(const SmallVector& RHS) { - SmallVectorImpl::operator=(RHS); - return *this; - } - - SmallVector(SmallVector&& RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - template - const SmallVector& operator=(const Container& RHS) { - this->assign(RHS.begin(), RHS.end()); - return *this; - } - - SmallVector(SmallVectorImpl&& RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - const SmallVector& operator=(SmallVector&& RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - const SmallVector& operator=(SmallVectorImpl&& RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - template - const SmallVector& operator=(Container&& C) { - this->assign(C.begin(), C.end()); - return *this; - } - - const SmallVector& operator=(std::initializer_list IL) { - this->assign(IL); - return *this; - } -}; - -template -inline size_t capacity_in_bytes(const SmallVector& X) { - return X.capacity_in_bytes(); -} - -} // end namespace at - -namespace std { - -/// Implement std::swap in terms of SmallVector swap. -template -inline void swap(at::SmallVectorImpl& LHS, at::SmallVectorImpl& RHS) { - LHS.swap(RHS); -} - -/// Implement std::swap in terms of SmallVector swap. -template -inline void swap(at::SmallVector& LHS, at::SmallVector& RHS) { - LHS.swap(RHS); -} - -} // end namespace std diff --git a/aten/src/ATen/core/UniqueVoidPtr.cpp b/aten/src/ATen/core/UniqueVoidPtr.cpp deleted file mode 100644 index fd08f7e13d2bf8..00000000000000 --- a/aten/src/ATen/core/UniqueVoidPtr.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include - -namespace at { -namespace detail { - -void deleteNothing(void*) {} - -} // namespace detail -} // namespace at diff --git a/aten/src/ATen/core/optional.h b/aten/src/ATen/core/optional.h deleted file mode 100644 index 8b0a7bfc4ead31..00000000000000 --- a/aten/src/ATen/core/optional.h +++ /dev/null @@ -1,1027 +0,0 @@ -// Copyright (C) 2011 - 2012 Andrzej Krzemienski. -// -// Use, modification, and distribution is subject to the Boost Software -// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -// -// The idea and interface is based on Boost.Optional library -// authored by Fernando Luis Cacciola Carballal -// -// From https://github.com/akrzemi1/Optional -// -// ATen: -// - Move to `at` namespace. -// - Remove macro use in line 478 because the nvcc device compiler cannot handle -// it. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#define TR2_OPTIONAL_REQUIRES(...) 
\ - typename std::enable_if<__VA_ARGS__::value, bool>::type = false - -#if defined __GNUC__ // NOTE: GNUC is also defined for Clang -#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) -#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -#elif (__GNUC__ > 4) -#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -#endif -# -#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) -#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -#elif (__GNUC__ > 4) -#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -#endif -# -#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) -#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) -#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#elif (__GNUC__ > 4) -#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#endif -#endif -# -#if defined __clang_major__ -#if (__clang_major__ == 3 && __clang_minor__ >= 5) -#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -#elif (__clang_major__ > 3) -#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -#endif -#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -#elif ( \ - __clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) -#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -#endif -#endif -# -#if defined _MSC_VER -#if (_MSC_VER >= 1900) -#define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -#endif -#endif - -#if defined __clang__ -#if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) -#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -#else -#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -#endif -#elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -#else -#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -#endif - -#if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 -#define OPTIONAL_CONSTEXPR_INIT_LIST constexpr -#else -#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 -#define OPTIONAL_CONSTEXPR_INIT_LIST -#endif - -#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && \ - (__cplusplus != 201103L) -#define OPTIONAL_HAS_MOVE_ACCESSORS 1 -#else -#define OPTIONAL_HAS_MOVE_ACCESSORS 0 -#endif - -#// In C++11 constexpr implies const, so we need to make non-const members also non-constexpr -#if (defined __cplusplus) && (__cplusplus == 201103L) -#define OPTIONAL_MUTABLE_CONSTEXPR -#else -#define OPTIONAL_MUTABLE_CONSTEXPR constexpr -#endif - -namespace at { - -// 20.5.4, optional for object types -template -class optional; - -// 20.5.5, optional for lvalue reference types -template -class optional; - -// workaround: std utility functions aren't constexpr yet -template -inline constexpr T&& constexpr_forward( - typename std::remove_reference::type& t) noexcept { - return static_cast(t); -} - -template -inline constexpr T&& constexpr_forward( - typename std::remove_reference::type&& t) noexcept { - static_assert(!std::is_lvalue_reference::value, "!!"); - return static_cast(t); -} - -template -inline constexpr typename std::remove_reference::type&& constexpr_move( - T&& t) noexcept { - return static_cast::type&&>(t); -} - -#if defined NDEBUG -#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) -#else -#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) \ - ((CHECK) ? (EXPR) : ([] { assert(!#CHECK); }(), (EXPR))) -#endif - -namespace detail_ { - -// static_addressof: a constexpr version of addressof -template -struct has_overloaded_addressof { - template - constexpr static bool has_overload(...) 
{ - return false; - } - - template ().operator&())> - constexpr static bool has_overload(bool) { - return true; - } - - constexpr static bool value = has_overload(true); -}; - -template )> -constexpr T* static_addressof(T& ref) { - return &ref; -} - -template )> -T* static_addressof(T& ref) { - return std::addressof(ref); -} - -// the call to convert(b) has return type A and converts b to type A iff b -// decltype(b) is implicitly convertible to A -template -constexpr U convert(U v) { - return v; -} - -} // namespace detail_ - -constexpr struct trivial_init_t { -} trivial_init{}; - -// 20.5.6, In-place construction -constexpr struct in_place_t { -} in_place{}; - -// 20.5.7, Disengaged state indicator -struct nullopt_t { - struct init {}; - constexpr explicit nullopt_t(init) {} -}; -constexpr nullopt_t nullopt{nullopt_t::init()}; - -// 20.5.8, class bad_optional_access -class bad_optional_access : public std::logic_error { - public: - explicit bad_optional_access(const std::string& what_arg) - : logic_error{what_arg} {} - explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} -}; - -template -union storage_t { - unsigned char dummy_; - T value_; - - constexpr storage_t(trivial_init_t) noexcept : dummy_(){}; - - template - constexpr storage_t(Args&&... args) - : value_(constexpr_forward(args)...) {} - - ~storage_t() {} -}; - -template -union constexpr_storage_t { - unsigned char dummy_; - T value_; - - constexpr constexpr_storage_t(trivial_init_t) noexcept : dummy_(){}; - - template - constexpr constexpr_storage_t(Args&&... args) - : value_(constexpr_forward(args)...) {} - - ~constexpr_storage_t() = default; -}; - -template -struct optional_base { - bool init_; - storage_t storage_; - - constexpr optional_base() noexcept : init_(false), storage_(trivial_init){}; - - explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} - - explicit constexpr optional_base(T&& v) - : init_(true), storage_(constexpr_move(v)) {} - - template - explicit optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template < - class U, - class... Args, - TR2_OPTIONAL_REQUIRES(std::is_constructible>)> - explicit optional_base( - in_place_t, - std::initializer_list il, - Args&&... args) - : init_(true), storage_(il, std::forward(args)...) {} - - ~optional_base() { - if (init_) - storage_.value_.T::~T(); - } -}; - -template -struct constexpr_optional_base { - bool init_; - constexpr_storage_t storage_; - - constexpr constexpr_optional_base() noexcept - : init_(false), storage_(trivial_init){}; - - explicit constexpr constexpr_optional_base(const T& v) - : init_(true), storage_(v) {} - - explicit constexpr constexpr_optional_base(T&& v) - : init_(true), storage_(constexpr_move(v)) {} - - template - explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template < - class U, - class... Args, - TR2_OPTIONAL_REQUIRES(std::is_constructible>)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base( - in_place_t, - std::initializer_list il, - Args&&... args) - : init_(true), storage_(il, std::forward(args)...) 
{} - - ~constexpr_optional_base() = default; -}; - -template -using OptionalBase = typename std::conditional< - std::is_trivially_destructible::value, // if possible - constexpr_optional_base::type>, // use base with trivial destructor - optional_base::type>>::type; - -template -class optional : private OptionalBase { - static_assert( - !std::is_same::type, nullopt_t>::value, - "bad T"); - static_assert( - !std::is_same::type, in_place_t>::value, - "bad T"); - - constexpr bool initialized() const noexcept { - return OptionalBase::init_; - } - typename std::remove_const::type* dataptr() { - return std::addressof(OptionalBase::storage_.value_); - } - constexpr const T* dataptr() const { - return detail_::static_addressof(OptionalBase::storage_.value_); - } - -#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - constexpr const T& contained_val() const& { - return OptionalBase::storage_.value_; - } -#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { - return std::move(OptionalBase::storage_.value_); - } - OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { - return OptionalBase::storage_.value_; - } -#else - T& contained_val() & { - return OptionalBase::storage_.value_; - } - T&& contained_val() && { - return std::move(OptionalBase::storage_.value_); - } -#endif -#else - constexpr const T& contained_val() const { - return OptionalBase::storage_.value_; - } - T& contained_val() { - return OptionalBase::storage_.value_; - } -#endif - - void clear() noexcept { - if (initialized()) - dataptr()->T::~T(); - OptionalBase::init_ = false; - } - - template - void initialize(Args&&... args) noexcept( - noexcept(T(std::forward(args)...))) { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(std::forward(args)...); - OptionalBase::init_ = true; - } - - template - void initialize(std::initializer_list il, Args&&... args) noexcept( - noexcept(T(il, std::forward(args)...))) { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(il, std::forward(args)...); - OptionalBase::init_ = true; - } - - public: - typedef T value_type; - - // 20.5.5.1, constructors - constexpr optional() noexcept : OptionalBase(){}; - constexpr optional(nullopt_t) noexcept : OptionalBase(){}; - - optional(const optional& rhs) : OptionalBase() { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(*rhs); - OptionalBase::init_ = true; - } - } - - optional(optional&& rhs) noexcept( - std::is_nothrow_move_constructible::value) - : OptionalBase() { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(std::move(*rhs)); - OptionalBase::init_ = true; - } - } - - constexpr optional(const T& v) : OptionalBase(v) {} - - constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} - - template - explicit constexpr optional(in_place_t, Args&&... args) - : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} - - template < - class U, - class... Args, - TR2_OPTIONAL_REQUIRES(std::is_constructible>)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit optional( - in_place_t, - std::initializer_list il, - Args&&... args) - : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} - - // 20.5.4.2, Destructor - ~optional() = default; - - // 20.5.4.3, assignment - optional& operator=(nullopt_t) noexcept { - clear(); - return *this; - } - - optional& operator=(const optional& rhs) { - if (initialized() == true && rhs.initialized() == false) - clear(); - else if (initialized() == false && rhs.initialized() == true) - initialize(*rhs); - else if (initialized() == true && rhs.initialized() == true) - contained_val() = *rhs; - return *this; - } - - optional& operator=(optional&& rhs) noexcept( - std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value) { - if (initialized() == true && rhs.initialized() == false) - clear(); - else if (initialized() == false && rhs.initialized() == true) - initialize(std::move(*rhs)); - else if (initialized() == true && rhs.initialized() == true) - contained_val() = std::move(*rhs); - return *this; - } - - template - auto operator=(U&& v) -> typename std::enable_if< - std::is_same::type, T>::value, - optional&>::type { - if (initialized()) { - contained_val() = std::forward(v); - } else { - initialize(std::forward(v)); - } - return *this; - } - - template - void emplace(Args&&... args) { - clear(); - initialize(std::forward(args)...); - } - - template - void emplace(std::initializer_list il, Args&&... args) { - clear(); - initialize(il, std::forward(args)...); - } - - // 20.5.4.4, Swap - void swap(optional& rhs) noexcept( - std::is_nothrow_move_constructible::value&& noexcept( - swap(std::declval(), std::declval()))) { - if (initialized() == true && rhs.initialized() == false) { - rhs.initialize(std::move(**this)); - clear(); - } else if (initialized() == false && rhs.initialized() == true) { - initialize(std::move(*rhs)); - rhs.clear(); - } else if (initialized() == true && rhs.initialized() == true) { - using std::swap; - swap(**this, *rhs); - } - } - - // 20.5.4.5, Observers - - explicit constexpr operator bool() const noexcept { - return initialized(); - } - constexpr bool has_value() const noexcept { - return initialized(); - } - - constexpr T const* operator->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); - } - -#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - OPTIONAL_MUTABLE_CONSTEXPR T* operator->() { - assert(initialized()); - return dataptr(); - } - - constexpr T const& operator*() const& { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& operator*() & { - assert(initialized()); - return contained_val(); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& operator*() && { - assert(initialized()); - return constexpr_move(contained_val()); - } - - constexpr T const& value() const& { - return initialized() - ? contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& value() & { - return initialized() - ? contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { - if (!initialized()) - throw bad_optional_access("bad optional access"); - return std::move(contained_val()); - } - -#else - - T* operator->() { - assert(initialized()); - return dataptr(); - } - - constexpr T const& operator*() const { - return contained_val(); - } - - T& operator*() { - assert(initialized()); - return contained_val(); - } - - constexpr T const& value() const { - return initialized() - ? 
contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - - T& value() { - return initialized() - ? contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - -#endif - -#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - - template - constexpr T value_or(V&& v) const& { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - template - OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && { - return *this - ? constexpr_move(const_cast&>(*this).contained_val()) - : detail_::convert(constexpr_forward(v)); - } - -#else - - template - T value_or(V&& v) && { - return *this - ? constexpr_move(const_cast&>(*this).contained_val()) - : detail_::convert(constexpr_forward(v)); - } - -#endif - -#else - - template - constexpr T value_or(V&& v) const { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -#endif - - // 20.6.3.6, modifiers - void reset() noexcept { - clear(); - } -}; - -template -class optional { - static_assert(!std::is_same::value, "bad T"); - static_assert(!std::is_same::value, "bad T"); - T* ref; - - public: - // 20.5.5.1, construction/destruction - constexpr optional() noexcept : ref(nullptr) {} - - constexpr optional(nullopt_t) noexcept : ref(nullptr) {} - - constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} - - optional(T&&) = delete; - - constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} - - explicit constexpr optional(in_place_t, T& v) noexcept - : ref(detail_::static_addressof(v)) {} - - explicit optional(in_place_t, T&&) = delete; - - ~optional() = default; - - // 20.5.5.2, mutation - optional& operator=(nullopt_t) noexcept { - ref = nullptr; - return *this; - } - - // optional& operator=(const optional& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - // optional& operator=(optional&& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - template - auto operator=(U&& rhs) noexcept -> typename std::enable_if< - std::is_same::type, optional>::value, - optional&>::type { - ref = rhs.ref; - return *this; - } - - template - auto operator=(U&& rhs) noexcept -> typename std::enable_if< - !std::is_same::type, optional>::value, - optional&>::type = delete; - - void emplace(T& v) noexcept { - ref = detail_::static_addressof(v); - } - - void emplace(T&&) = delete; - - void swap(optional& rhs) noexcept { - std::swap(ref, rhs.ref); - } - - // 20.5.5.3, observers - constexpr T* operator->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); - } - - constexpr T& operator*() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); - } - - constexpr T& value() const { - return ref ? *ref - : (throw bad_optional_access("bad optional access"), *ref); - } - - explicit constexpr operator bool() const noexcept { - return ref != nullptr; - } - - constexpr bool has_value() const noexcept { - return ref != nullptr; - } - - template - constexpr typename std::decay::type value_or(V&& v) const { - return *this ? **this - : detail_::convert::type>( - constexpr_forward(v)); - } - - // x.x.x.x, modifiers - void reset() noexcept { - ref = nullptr; - } -}; - -template -class optional { - static_assert(sizeof(T) == 0, "optional rvalue references disallowed"); -}; - -// 20.5.8, Relational operators -template -constexpr bool operator==(const optional& x, const optional& y) { - return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; -} - -template -constexpr bool operator!=(const optional& x, const optional& y) { - return !(x == y); -} - -template -constexpr bool operator<(const optional& x, const optional& y) { - return (!y) ? false : (!x) ? true : *x < *y; -} - -template -constexpr bool operator>(const optional& x, const optional& y) { - return (y < x); -} - -template -constexpr bool operator<=(const optional& x, const optional& y) { - return !(y < x); -} - -template -constexpr bool operator>=(const optional& x, const optional& y) { - return !(x < y); -} - -// 20.5.9, Comparison with nullopt -template -constexpr bool operator==(const optional& x, nullopt_t) noexcept { - return (!x); -} - -template -constexpr bool operator==(nullopt_t, const optional& x) noexcept { - return (!x); -} - -template -constexpr bool operator!=(const optional& x, nullopt_t) noexcept { - return bool(x); -} - -template -constexpr bool operator!=(nullopt_t, const optional& x) noexcept { - return bool(x); -} - -template -constexpr bool operator<(const optional&, nullopt_t) noexcept { - return false; -} - -template -constexpr bool operator<(nullopt_t, const optional& x) noexcept { - return bool(x); -} - -template -constexpr bool operator<=(const optional& x, nullopt_t) noexcept { - return (!x); -} - -template -constexpr bool operator<=(nullopt_t, const optional&) noexcept { - return true; -} - -template -constexpr bool operator>(const optional& x, nullopt_t) noexcept { - return bool(x); -} - -template -constexpr bool operator>(nullopt_t, const optional&) noexcept { - return false; -} - -template -constexpr bool operator>=(const optional&, nullopt_t) noexcept { - return true; -} - -template -constexpr bool operator>=(nullopt_t, const optional& x) noexcept { - return (!x); -} - -// 20.5.10, Comparison with T -template -constexpr bool operator==(const optional& x, const T& v) { - return bool(x) ? *x == v : false; -} - -template -constexpr bool operator==(const T& v, const optional& x) { - return bool(x) ? v == *x : false; -} - -template -constexpr bool operator!=(const optional& x, const T& v) { - return bool(x) ? *x != v : true; -} - -template -constexpr bool operator!=(const T& v, const optional& x) { - return bool(x) ? v != *x : true; -} - -template -constexpr bool operator<(const optional& x, const T& v) { - return bool(x) ? *x < v : true; -} - -template -constexpr bool operator>(const T& v, const optional& x) { - return bool(x) ? v > *x : true; -} - -template -constexpr bool operator>(const optional& x, const T& v) { - return bool(x) ? *x > v : false; -} - -template -constexpr bool operator<(const T& v, const optional& x) { - return bool(x) ? v < *x : false; -} - -template -constexpr bool operator>=(const optional& x, const T& v) { - return bool(x) ? *x >= v : false; -} - -template -constexpr bool operator<=(const T& v, const optional& x) { - return bool(x) ? v <= *x : false; -} - -template -constexpr bool operator<=(const optional& x, const T& v) { - return bool(x) ? *x <= v : true; -} - -template -constexpr bool operator>=(const T& v, const optional& x) { - return bool(x) ? v >= *x : true; -} - -// Comparison of optional with T -template -constexpr bool operator==(const optional& x, const T& v) { - return bool(x) ? *x == v : false; -} - -template -constexpr bool operator==(const T& v, const optional& x) { - return bool(x) ? v == *x : false; -} - -template -constexpr bool operator!=(const optional& x, const T& v) { - return bool(x) ? 
*x != v : true; -} - -template -constexpr bool operator!=(const T& v, const optional& x) { - return bool(x) ? v != *x : true; -} - -template -constexpr bool operator<(const optional& x, const T& v) { - return bool(x) ? *x < v : true; -} - -template -constexpr bool operator>(const T& v, const optional& x) { - return bool(x) ? v > *x : true; -} - -template -constexpr bool operator>(const optional& x, const T& v) { - return bool(x) ? *x > v : false; -} - -template -constexpr bool operator<(const T& v, const optional& x) { - return bool(x) ? v < *x : false; -} - -template -constexpr bool operator>=(const optional& x, const T& v) { - return bool(x) ? *x >= v : false; -} - -template -constexpr bool operator<=(const T& v, const optional& x) { - return bool(x) ? v <= *x : false; -} - -template -constexpr bool operator<=(const optional& x, const T& v) { - return bool(x) ? *x <= v : true; -} - -template -constexpr bool operator>=(const T& v, const optional& x) { - return bool(x) ? v >= *x : true; -} - -// Comparison of optional with T -template -constexpr bool operator==(const optional& x, const T& v) { - return bool(x) ? *x == v : false; -} - -template -constexpr bool operator==(const T& v, const optional& x) { - return bool(x) ? v == *x : false; -} - -template -constexpr bool operator!=(const optional& x, const T& v) { - return bool(x) ? *x != v : true; -} - -template -constexpr bool operator!=(const T& v, const optional& x) { - return bool(x) ? v != *x : true; -} - -template -constexpr bool operator<(const optional& x, const T& v) { - return bool(x) ? *x < v : true; -} - -template -constexpr bool operator>(const T& v, const optional& x) { - return bool(x) ? v > *x : true; -} - -template -constexpr bool operator>(const optional& x, const T& v) { - return bool(x) ? *x > v : false; -} - -template -constexpr bool operator<(const T& v, const optional& x) { - return bool(x) ? v < *x : false; -} - -template -constexpr bool operator>=(const optional& x, const T& v) { - return bool(x) ? *x >= v : false; -} - -template -constexpr bool operator<=(const T& v, const optional& x) { - return bool(x) ? v <= *x : false; -} - -template -constexpr bool operator<=(const optional& x, const T& v) { - return bool(x) ? *x <= v : true; -} - -template -constexpr bool operator>=(const T& v, const optional& x) { - return bool(x) ? v >= *x : true; -} - -// 20.5.12, Specialized algorithms -template -void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) { - x.swap(y); -} - -template -constexpr optional::type> make_optional(T&& v) { - return optional::type>(constexpr_forward(v)); -} - -template -constexpr optional make_optional(std::reference_wrapper v) { - return optional(v.get()); -} - -} // namespace at - -namespace std { -template -struct hash> { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? std::hash{}(*arg) : result_type{}; - } -}; - -template -struct hash> { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? 
std::hash{}(*arg) : result_type{}; - } -}; -} // namespace std - -#undef TR2_OPTIONAL_REQUIRES -#undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h deleted file mode 100644 index eed9f677a2ef18..00000000000000 --- a/aten/src/ATen/cuda/detail/KernelUtils.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once -// Contents of this file are copied from THCUNN/common.h for the ease of porting -// THCUNN functions into ATen. - -namespace at { namespace cuda { namespace detail { - -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -// Use 1024 threads per block, which requires cuda sm_2x or above -constexpr int CUDA_NUM_THREADS = 1024; - -// CUDA: number of blocks for threads. -inline int GET_BLOCKS(const int N) -{ - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -}}} // namespace at::cuda::detail diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 7ce3da3c9e051c..085f2723bf0455 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -319,20 +319,6 @@ struct AT_CUDA_API RNNDescriptor } }; -#if CUDNN_VERSION >= 7000 - -struct AT_CUDA_API CTCLossDescriptor - : public Descriptor -{ - void set(cudnnDataType_t datatype) { - AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype)); - } -}; - -#endif - union Constant { float f; diff --git a/aten/src/ATen/detail/UniqueVoidPtr.cpp b/aten/src/ATen/detail/UniqueVoidPtr.cpp new file mode 100644 index 00000000000000..07531d826367ae --- /dev/null +++ b/aten/src/ATen/detail/UniqueVoidPtr.cpp @@ -0,0 +1,7 @@ +#include + +namespace at { namespace detail { + +void deleteNothing(void*) {} + +}} // namespace at diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/detail/UniqueVoidPtr.h similarity index 77% rename from aten/src/ATen/core/UniqueVoidPtr.h rename to aten/src/ATen/detail/UniqueVoidPtr.h index 299c729e125a58..e277014a7935d6 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/detail/UniqueVoidPtr.h @@ -1,15 +1,15 @@ #include -#include +#include namespace at { -using DeleterFnPtr = void (*)(void*); +using DeleterFnPtr = void(*)(void*); namespace detail { // Does not delete anything -AT_CORE_API void deleteNothing(void*); +AT_API void deleteNothing(void*); // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but // with three major differences: @@ -35,47 +35,33 @@ AT_CORE_API void deleteNothing(void*); // to reflect this. // class UniqueVoidPtr { - private: +private: // Lifetime tied to ctx_ void* data_; std::unique_ptr ctx_; - - public: +public: UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {} - explicit UniqueVoidPtr(void* data) - : data_(data), ctx_(nullptr, &deleteNothing) {} + explicit UniqueVoidPtr(void* data) : data_(data), ctx_(nullptr, &deleteNothing) {} UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) - : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} - void* operator->() const { - return data_; - } + : data_(data), ctx_(ctx, ctx_deleter ? 
ctx_deleter : &deleteNothing) {} + void* operator->() const { return data_; } void clear() { ctx_ = nullptr; data_ = nullptr; } - void* get() const { - return data_; - } - void* get_context() const { - return ctx_.get(); - } - void* release_context() { - return ctx_.release(); - } + void* get() const { return data_; } + void* get_context() const { return ctx_.get(); } + void* release_context() { return ctx_.release(); } template T* cast_context(DeleterFnPtr expected_deleter) const { - if (get_deleter() != expected_deleter) - return nullptr; + if (get_deleter() != expected_deleter) return nullptr; return static_cast(get_context()); } - operator bool() const { - return data_ || ctx_; - } - DeleterFnPtr get_deleter() const { - return ctx_.get_deleter(); - } + operator bool() const { return data_ || ctx_; } + DeleterFnPtr get_deleter() const { return ctx_.get_deleter(); } }; + // Note [How UniqueVoidPtr is implemented] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // UniqueVoidPtr solves a common problem for allocators of tensor data, which @@ -94,18 +80,9 @@ class UniqueVoidPtr { // pointer itself. In simple cases, the context pointer is just the pointer // itself. -inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { - return !sp; -} -inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { - return !sp; -} -inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { - return sp; -} -inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { - return sp; -} +inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return !sp; } +inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return !sp; } +inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return sp; } +inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return sp; } -} // namespace detail -} // namespace at +}} // namespace at::detail diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/detail/VariableHooksInterface.h index 836dacb97766ec..287116490397f3 100644 --- a/aten/src/ATen/detail/VariableHooksInterface.h +++ b/aten/src/ATen/detail/VariableHooksInterface.h @@ -3,7 +3,6 @@ #include #include #include -#include namespace at { class Context; @@ -26,10 +25,6 @@ struct AT_API VariableHooksInterface { // squelch -Werror=non-virtual-dtor virtual ~VariableHooksInterface() {} - virtual Type& getVariableType(const at::Type& baseType) const { - AT_ERROR("cannot getVariableType without libtorch"); - } - virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index b012de25194361..93c20d4be032f4 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -290,7 +290,7 @@ def __init__(self, reason): 'Backend::${DenseBackend}, ScalarType::Long)'), 'THStorage*': CodeTemplate( - 'checked_cast_storage(' + 'checked_cast_storage<${Storage}>(' '&${arg_name},"${arg_name}",${arg_pos}, ' 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 209cca57c293ff..0f2aaffd6eac9d 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -103,6 +103,10 @@ def check_all_files_written(self): TEMPLATE_PATH = options.source_path + "/templates" GENERATOR_DERIVED = 
CodeTemplate.from_file( TEMPLATE_PATH + "/GeneratorDerived.h") +STORAGE_DERIVED_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/StorageDerived.cpp") +STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") + TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") @@ -233,6 +237,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['isFloatingType'] = is_floating_type env['isIntegralType'] = not is_floating_type if density == 'Dense': + env['Storage'] = "{}{}Storage".format(backend, scalar_name) env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) @@ -241,6 +246,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['storage_tensor_headers'] = [] if density != 'Sparse': env['storage_tensor_headers'] = [ + '#include "ATen/{}.h"'.format(env['Storage']), '#include "ATen/{}.h"'.format(env['Tensor']), '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), '#include "ATen/{}IntTensor.h"'.format(env['Backend']), @@ -316,6 +322,8 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if density != 'Sparse': # there are no storage or tensor types for sparse; it's all uniform + fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) + fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) @@ -371,7 +379,7 @@ def declare_outputs(): for backend, density, scalar_types in iterate_types(): scalar_name = scalar_types[0] full_backend = "Sparse" + backend if density == "Sparse" else backend - for kind in ["Type", "Tensor"]: + for kind in ["Storage", "Type", "Tensor"]: if kind != 'Type' and density == "Sparse": # No Storage or Tensor for sparse continue diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 36f1e4c0bf86de..a3dc735ab1e4cb 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -25,16 +25,6 @@ Tensor & selu_(Tensor & self) { return at::elu_(self, SELU_ALPHA, SELU_SCALE); } -Tensor celu(const Tensor & self, Scalar alpha) { - double inv_alpha = 1. / alpha.to(); - return at::elu(self, 1.0, alpha, Scalar(inv_alpha)); -} - -Tensor & celu_(Tensor & self, Scalar alpha) { - double inv_alpha = 1. 
/ alpha.to(); - return at::elu_(self, 1.0, alpha, Scalar(inv_alpha)); -} - Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 4028e989b87022..a537691f748171 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -402,11 +402,11 @@ at::Tensor _convolution_nogroup( bool transposed, IntList output_padding) { ConvParams params; - params.stride = stride.vec(); - params.padding = padding.vec(); - params.dilation = dilation.vec(); + params.stride = stride; + params.padding = padding; + params.dilation = dilation; params.transposed = transposed; - params.output_padding = output_padding.vec(); + params.output_padding = output_padding; params.groups = 1; params.benchmark = false; params.deterministic = false; @@ -474,11 +474,11 @@ std::tuple _convolution_double_backward( auto weight = weight_r; ConvParams params; - params.stride = stride_.vec(); - params.padding = padding_.vec(); - params.dilation = dilation_.vec(); + params.stride = stride_; + params.padding = padding_; + params.dilation = dilation_; params.transposed = transposed_; - params.output_padding = output_padding_.vec(); + params.output_padding = output_padding_; params.groups = groups_; params.benchmark = benchmark; params.deterministic = deterministic; diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index c374740a3ce7d1..7a6e0788531172 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -57,7 +57,6 @@ deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler& st // Boost alpha for higher acceptance probability. 
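The branch that follows is the standard boosting trick for small shape parameters (Marsaglia and Tsang): draw Y from Gamma(alpha + 1) and return Y * U^(1/alpha) with U uniform on (0, 1), which is Gamma(alpha) distributed. A quick plain-C++ sanity check of the mean, which should land near alpha, using std::gamma_distribution rather than ATen's sampler:

#include <cmath>
#include <iostream>
#include <random>

int main() {
  const double alpha = 0.3;                      // shape parameter < 1
  std::mt19937 gen(42);
  std::gamma_distribution<double> boosted(alpha + 1.0, 1.0);
  std::uniform_real_distribution<double> unif(0.0, 1.0);

  double sum = 0.0;
  const int n = 200000;
  for (int i = 0; i < n; ++i) {
    double u = 1.0 - unif(gen);                  // keep u strictly positive, as the code above does
    sum += boosted(gen) * std::pow(u, 1.0 / alpha);
  }
  std::cout << "empirical mean " << sum / n << ", expected " << alpha << "\n";
  return 0;
}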
if (alpha < 1.0f) { - if (alpha == 0.f) return 0.f; scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); alpha += 1.0f; } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 0026a9907d7eca..7599386ee74172 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -24,7 +24,7 @@ Tensor embedding(const Tensor & weight, const Tensor & indices, return weight.index_select(0, indices); } - auto size = indices.sizes().vec(); + auto size = std::vector(indices.sizes()); for (auto d : weight.sizes().slice(1)) { size.push_back(d); } diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp deleted file mode 100644 index 5f1c8255772dcf..00000000000000 --- a/aten/src/ATen/native/GridSampler.cpp +++ /dev/null @@ -1,780 +0,0 @@ -#include "ATen/ATen.h" -#include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" -#include "ATen/native/GridSampler.h" - -#ifdef _OPENMP -#include -#endif - -namespace at { namespace native { - -using at::native::detail::GridSamplerInterpolation; -using at::native::detail::GridSamplerPadding; - -namespace { - static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { - return std::min(clip_limit - 1, std::max(in, static_cast(0))); - } - - static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { - return h >= 0 && h < H && w >= 0 && w < W; - } - - static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; - } - - template - static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, - int64_t sH, int64_t sW, int64_t H, int64_t W, - scalar_t delta) { - if (within_bounds_2d(h, w, H, W)) { - data[h * sH + w * sW] += delta; - } - } - - template - static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, - int64_t sD, int64_t sH, int64_t sW, - int64_t D, int64_t H, int64_t W, - scalar_t delta) { - if (within_bounds_3d(d, h, w, D, H, W)) { - data[d * sD + h * sH + w * sW] += delta; - } - } - - template - Tensor grid_sampler2d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h 
* grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_nw = clip_coordinates(ix_nw, inp_W); - iy_nw = clip_coordinates(iy_nw, inp_H); - ix_ne = clip_coordinates(ix_ne, inp_W); - iy_ne = clip_coordinates(iy_ne, inp_H); - ix_sw = clip_coordinates(ix_sw, inp_W); - iy_sw = clip_coordinates(iy_sw, inp_H); - ix_se = clip_coordinates(ix_se, inp_W); - iy_se = clip_coordinates(iy_se, inp_H); - } - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - *out_ptr_NCHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } - } - } - return output; - } - - template - Tensor grid_sampler3d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_D = input.size(2); - int64_t inp_H = input.size(3); - int64_t inp_W = input.size(4); - int64_t out_D = grid.size(1); - int64_t out_H = grid.size(2); - int64_t out_W = grid.size(3); - auto output = at::empty({N, C, out_D, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sD = input.stride(2); - int64_t inp_sH = input.stride(3); - int64_t inp_sW = input.stride(4); - int64_t grid_sN = grid.stride(0); - int64_t grid_sD = grid.stride(1); - int64_t grid_sH = grid.stride(2); - int64_t grid_sW = grid.stride(3); - int64_t grid_sCoor = grid.stride(4); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sD = output.stride(2); - int64_t out_sH = output.stride(3); - int64_t out_sW = output.stride(4); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t 
*grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NDHW; - scalar_t iy = grid_ptr_NDHW[grid_sCoor]; - scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - iz = ((iz + 1) / 2) * (inp_D - 1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_tnw = static_cast(std::floor(ix)); - int64_t iy_tnw = static_cast(std::floor(iy)); - int64_t iz_tnw = static_cast(std::floor(iz)); - - int64_t ix_tne = ix_tnw + 1; - int64_t iy_tne = iy_tnw; - int64_t iz_tne = iz_tnw; - - int64_t ix_tsw = ix_tnw; - int64_t iy_tsw = iy_tnw + 1; - int64_t iz_tsw = iz_tnw; - - int64_t ix_tse = ix_tnw + 1; - int64_t iy_tse = iy_tnw + 1; - int64_t iz_tse = iz_tnw; - - int64_t ix_bnw = ix_tnw; - int64_t iy_bnw = iy_tnw; - int64_t iz_bnw = iz_tnw + 1; - - int64_t ix_bne = ix_tnw + 1; - int64_t iy_bne = iy_tnw; - int64_t iz_bne = iz_tnw + 1; - - int64_t ix_bsw = ix_tnw; - int64_t iy_bsw = iy_tnw + 1; - int64_t iz_bsw = iz_tnw + 1; - - int64_t ix_bse = ix_tnw + 1; - int64_t iy_bse = iy_tnw + 1; - int64_t iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw = clip_coordinates(ix_tnw, inp_W); - iy_tnw = clip_coordinates(iy_tnw, inp_H); - iz_tnw = clip_coordinates(iz_tnw, inp_D); - ix_tne = clip_coordinates(ix_tne, inp_W); - iy_tne = clip_coordinates(iy_tne, inp_H); - iz_tne = clip_coordinates(iz_tne, inp_D); - ix_tsw = clip_coordinates(ix_tsw, inp_W); - iy_tsw = clip_coordinates(iy_tsw, inp_H); - iz_tsw = clip_coordinates(iz_tsw, inp_D); - ix_tse = clip_coordinates(ix_tse, inp_W); - iy_tse = clip_coordinates(iy_tse, inp_H); - iz_tse = clip_coordinates(iz_tse, inp_D); - ix_bnw = clip_coordinates(ix_bnw, inp_W); - iy_bnw = clip_coordinates(iy_bnw, inp_H); - iz_bnw = clip_coordinates(iz_bnw, inp_D); - ix_bne = clip_coordinates(ix_bne, inp_W); - iy_bne = clip_coordinates(iy_bne, inp_H); - iz_bne = clip_coordinates(iz_bne, inp_D); - ix_bsw = clip_coordinates(ix_bsw, inp_W); - iy_bsw = clip_coordinates(iy_bsw, inp_H); - iz_bsw = clip_coordinates(iz_bsw, inp_D); - ix_bse = clip_coordinates(ix_bse, inp_W); - iy_bse = clip_coordinates(iy_bse, inp_H); - iz_bse = clip_coordinates(iz_bse, inp_D); - } - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iz_tnw, 
iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCDHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } - } - } - } - return output; - } - - template - std::tuple - grid_sampler2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t 
n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - int64_t ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - if (padding_mode == GridSamplerPadding::Border) { - // get clipped NE, NW, SE, SW pixel values from (x, y) - ix_nw_cl = clip_coordinates(ix_nw, inp_W); - iy_nw_cl = clip_coordinates(iy_nw, inp_H); - ix_ne_cl = clip_coordinates(ix_ne, inp_W); - iy_ne_cl = clip_coordinates(iy_ne, inp_H); - ix_sw_cl = clip_coordinates(ix_sw, inp_W); - iy_sw_cl = clip_coordinates(iy_sw, inp_H); - ix_se_cl = clip_coordinates(ix_se, inp_W); - iy_se_cl = clip_coordinates(iy_se, inp_H); - } else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (padding_mode != 
GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix; - gGrid_ptr_NHW[1] = giy; - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - - template - std::tuple - grid_sampler3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_D = input.size(2); - int64_t inp_H = input.size(3); - int64_t inp_W = input.size(4); - int64_t out_D = grid.size(1); - int64_t out_H = grid.size(2); - int64_t out_W = grid.size(3); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sD = input.stride(2); - int64_t inp_sH = input.stride(3); - int64_t inp_sW = input.stride(4); - int64_t grid_sN = grid.stride(0); - int64_t grid_sD = grid.stride(1); - int64_t grid_sH = grid.stride(2); - int64_t grid_sW = grid.stride(3); - int64_t grid_sCoor = grid.stride(4); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sD = grad_output.stride(2); - int64_t gOut_sH = grad_output.stride(3); - int64_t gOut_sW = grad_output.stride(4); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sD = grad_input.stride(2); - int64_t gInp_sH = grad_input.stride(3); - int64_t gInp_sW = grad_input.stride(4); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NDHW; - scalar_t iy = grid_ptr_NDHW[grid_sCoor]; - scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - iz = ((iz + 1) / 2) * (inp_D - 1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_tnw = static_cast(std::floor(ix)); - int64_t iy_tnw = static_cast(std::floor(iy)); - int64_t iz_tnw = static_cast(std::floor(iz)); - - int64_t ix_tne = ix_tnw + 1; - int64_t iy_tne = iy_tnw; - int64_t iz_tne = iz_tnw; - - int64_t ix_tsw = ix_tnw; - int64_t iy_tsw = iy_tnw + 1; - int64_t iz_tsw = iz_tnw; - - int64_t ix_tse = ix_tnw + 1; - int64_t iy_tse = 
iy_tnw + 1; - int64_t iz_tse = iz_tnw; - - int64_t ix_bnw = ix_tnw; - int64_t iy_bnw = iy_tnw; - int64_t iz_bnw = iz_tnw + 1; - - int64_t ix_bne = ix_tnw + 1; - int64_t iy_bne = iy_tnw; - int64_t iz_bne = iz_tnw + 1; - - int64_t ix_bsw = ix_tnw; - int64_t iy_bsw = iy_tnw + 1; - int64_t iz_bsw = iz_tnw + 1; - - int64_t ix_bse = ix_tnw + 1; - int64_t iy_bse = iy_tnw + 1; - int64_t iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - int64_t ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int64_t ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int64_t ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int64_t ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); - iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); - iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); - ix_tne_cl = clip_coordinates(ix_tne, inp_W); - iy_tne_cl = clip_coordinates(iy_tne, inp_H); - iz_tne_cl = clip_coordinates(iz_tne, inp_D); - ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); - iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); - iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); - ix_tse_cl = clip_coordinates(ix_tse, inp_W); - iy_tse_cl = clip_coordinates(iy_tse, inp_H); - iz_tse_cl = clip_coordinates(iz_tse, inp_D); - ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); - iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); - iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); - ix_bne_cl = clip_coordinates(ix_bne, inp_W); - iy_bne_cl = clip_coordinates(iy_bne, inp_H); - iz_bne_cl = clip_coordinates(iz_bne, inp_D); - ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); - iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); - iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); - ix_bse_cl = clip_coordinates(ix_bse, inp_W); - iy_bse_cl = clip_coordinates(iy_bse, inp_H); - iz_bse_cl = clip_coordinates(iz_bse, inp_D); - } else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCDHW; - - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, 
iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { - scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { - scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { - scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { - scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { - scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { - scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { - scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * 
inp_sW]; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { - scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - giz = giz * (inp_D - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NDHW[0] = gix; - gGrid_ptr_NDHW[1] = giy; - gGrid_ptr_NDHW[2] = giz; - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { - return grid_sampler2d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler3d_cpu", [&] { - return grid_sampler3d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler2d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
-std::tuple -grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_3d_backward_cpu", [&] { - return grid_sampler3d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { - AT_CHECK( - (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), - "grid_sampler(): expected 4D or 5D input and grid with same number " - "dimensions, but got input with sizes ", input.sizes(), - " and grid with sizes ", grid.sizes()); - AT_CHECK( - input.size(0) == grid.size(0), - "grid_sampler(): expected grid and input to have same batch size, but got " - "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - AT_CHECK( - grid.size(-1) == input.dim() - 2, - "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " - "dimension, but got grid with sizes ", grid.sizes()); - // cudnn does not support inputs larger than 1024 - if (at::native::cudnn_is_acceptable(input) && - static_cast(padding_mode) == GridSamplerPadding::Zeros && - input.dim() == 4 && - input.size(1) <= 1024) { - return cudnn_grid_sampler(input, grid); - } - if (input.dim() == 4) { - return at::grid_sampler_2d(input, grid, 0, padding_mode); - } else { - return at::grid_sampler_3d(input, grid, 0, padding_mode); - } -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h deleted file mode 100644 index f39b4e996469fa..00000000000000 --- a/aten/src/ATen/native/GridSampler.h +++ /dev/null @@ -1,9 +0,0 @@ -#include "ATen/ATen.h" -#include "ATen/NativeFunctions.h" - -namespace at { namespace native { namespace detail { - - enum class GridSamplerInterpolation {Bilinear, Nearest}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -}}} // namespace at::native::detail diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index e4eb336cd5f453..9720adb4895769 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -69,7 +69,11 @@ static std::vector expandByteTensors(const Tensor & self, TensorList ind } // Replace with nonzeros auto nonzero = index.nonzero(); +#ifndef USE_TH_SIZE_ZERO_DIM + auto special_empty = nonzero.numel() == 0; +#else auto special_empty = false; +#endif for (int64_t j = 0; j < index.dim(); j++) { if (special_empty) { // We can't call select on an empty tensor so we just create an empty @@ -210,10 +214,26 @@ static Tensor computeLinearIndex(const Tensor & src, TensorList indices) { return linearIndex; } +#ifndef USE_TH_SIZE_ZERO_DIM +static bool hasEmptyTensor(TensorList tensors) { + for (auto& tensor : tensors) { + if (tensor.defined() && tensor.numel() == 0) { + return true; + } + } + return false; +} +#endif + static std::tuple makeLinearIndex(Tensor self, TensorList orig) { checkIndexTensorTypes(orig); // first expand ByteTensor (boolean masks) into 1 or more LongTensors auto indices = expandByteTensors(self, orig); +#ifndef USE_TH_SIZE_ZERO_DIM + if (hasEmptyTensor(indices)) { + return std::make_tuple(self, self.type().toScalarType(kLong).tensor()); + } +#endif // next broadcast all index tensors together indices = expand_outplace(indices); // add missing null Tensors so that it matches self.dim() @@ -279,11 +299,11 @@ Tensor & 
index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten } // Check that source and destination slices have the same size - auto selfSlicedSizes = self.sizes().vec(); + auto selfSlicedSizes = std::vector(self.sizes()); if (selfSlicedSizes.size() > 0) { selfSlicedSizes.erase(selfSlicedSizes.begin() + dim); } - auto sourceSlicedSizes = source.sizes().vec(); + auto sourceSlicedSizes = std::vector(source.sizes()); if (sourceSlicedSizes.size() > 0) { sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index c82bf8ba0ae043..cb24e71119f9b1 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,7 +1,6 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtilsMulti.h" -#include namespace at { namespace native { @@ -137,8 +136,6 @@ Tensor einsum(std::string eqn, TensorList tensors) { } else { in_eqn = eqn; } - // remove spaces for einsum compatibility (#9929) - in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index int64_t operand = 0; @@ -215,7 +212,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_output_dims++; } } - } else if (! isspace(c)) { // letter (hopefully) + } else { // letter (hopefully) AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp deleted file mode 100644 index 092b7255eb4a0d..00000000000000 --- a/aten/src/ATen/native/LossCTC.cpp +++ /dev/null @@ -1,365 +0,0 @@ -// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann -// Licensed under the BSD-3-Clause license -// This is the CPU implementation of the Connectionist Temporal Loss. -// We mostly follow Graves. -// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf -// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. -// Graves et al call the probabilities y, we use log_probs (also calling them inputs) - -#include -#include "ATen/Dispatch.h" -#include "ATen/TensorUtils.h" - -#include -#include - -namespace at { -namespace native { - -namespace { - -// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done -template -static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { - if (idx % 2 == 0) { - return BLANK; - } else { - return target[offset + stride * (idx / 2)]; - } -} - -// This kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). -// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). -// The function returns the loss and the alphas, the alphas are kept for the backward step. The wrapper (ctc_loss below) hides -// the alphas from the user by only returning the loss. 
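// In equations, the forward pass below computes, for the augmented target l'
// (the target interleaved with blanks, length 2*target_length + 1), roughly
//   alpha_t(s) = y_t(l'_s) * ( alpha_{t-1}(s) + alpha_{t-1}(s-1)
//                              + [s > 1 and l'_{s-2} != l'_s] * alpha_{t-1}(s-2) ),
// with out-of-range terms treated as 0 (neginf in log space), and the loss is
//   -log( alpha_{T-1}(2*target_length) + alpha_{T-1}(2*target_length - 1) ).
// Everything is carried out in log space with a max-shift for stability.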
-template -std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - // log_probs: input_len x batch_size x num_labels - // targets [int64]: batch_size x target_length OR sum(target_lengths) - constexpr scalar_t neginf = -std::numeric_limits::infinity(); - using target_t = typename std::conditional::type; - - CheckedFrom c = "ctc_loss_cpu"; - auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); - auto targets_arg = TensorArg(targets, "targets", 2); - checkScalarType(c, targets_arg, target_scalar_type); - checkDim(c, log_probs_arg, 3); - checkDimRange(c, targets_arg, 1, 3); - - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - AT_CHECK(BLANK < num_labels, "blank must be in label range"); - AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); - - size_t tg_target_stride; - int64_t max_target_length; - std::vector tg_batch_offsets(batch_size); - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - checkSize(c, targets_arg, 0, pos); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, - "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, - "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - - Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); - Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - - auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); - auto log_alpha_a_global = log_alpha.accessor(); - auto targets_data = targets.data(); - auto neg_log_likelihood_a = neg_log_likelihood.accessor(); - - // alpha calculation for the first row, the three equations for alpha_1 above eq (6) - // first the default - log_alpha.narrow(1, 0, 1).fill_(neginf); - #pragma omp parallel for - for (int64_t b = 0; b < batch_size; b++) { - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - auto log_probs_a = log_probs_a_global[b]; - auto log_alpha_a = log_alpha_a_global[b]; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - // the first two items of alpha_t above eq (6) - log_alpha_a[0][0] = log_probs_a[0][BLANK]; - if (target_length > 0) - log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; - - // now the loop over the inputs - for (int64_t t=1; t 0) { 
- la2 = log_alpha_a[t-1][s-1]; - if (la2 > lamax) - lamax = la2; - } else { - la2 = neginf; - } - if ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != - current_target_prime)) { - la3 = log_alpha_a[t-1][s-2]; - if (la3 > lamax) - lamax = la3; - } else { - la3 = neginf; - } - if (lamax == neginf) // cannot do neginf-neginf - lamax = 0; - // this is the assignment of eq (6) - log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime]; - } - } - // the likelihood is the the sum of the last two alphas, eq (8), the loss is the negative log likelihood - scalar_t l1 = log_alpha_a[input_length-1][target_length*2]; - scalar_t l2 = log_alpha_a[input_length-1][target_length*2-1]; - scalar_t m = std::max(l1, l2); - m = ((m == neginf) ? 0 : m); - scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; - neg_log_likelihood_a[b] = -log_likelihood; - } - - return std::make_tuple(neg_log_likelihood, log_alpha); -} - -// This is the backward. It consists of two phases: -// a) computing the beta analogous to the alphas in the forward (backward half of the forward-backward algorithm) (eq (10) and (11)) -// b) collecting the per-activation characters for all s and wrapping the gradient (eq (16), the collection is the sum) -template -Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - constexpr scalar_t neginf = -std::numeric_limits::infinity(); - using target_t = typename std::conditional::type; - int64_t max_input_length = log_probs.size(0); - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - Tensor grad = at::full_like(log_probs, neginf); // at this point, this is log of empty sum - - // The admin bits. We don't do much checking and assume that the forward did. 
- int64_t tg_target_stride; - int64_t max_target_length; - std::vector tg_batch_offsets(batch_size); - - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - } - - Tensor log_beta = at::empty_like(log_alpha); // could be optimized to use only 2 rows - auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); - auto log_alpha_a_global = log_alpha.accessor(); - auto log_beta_a_global = log_beta.accessor(); - auto gp = grad.permute({1,0,2}); - auto grad_a_global = gp.accessor(); - auto targets_data = targets.data(); - - #pragma omp parallel for - for (int64_t b = 0; b < batch_size; b++) { - auto log_probs_a = log_probs_a_global[b]; - auto log_alpha_a = log_alpha_a_global[b]; - auto log_beta_a = log_beta_a_global[b]; - auto grad_a = grad_a_global[b]; - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - // the initialization of beta before eq (10) - // here we do the fill for each batch item separately, as the input lengths will differ, so the t in which - // we start varies - if (input_length > 0) { - log_beta.narrow(0, b, 1).narrow(1, input_length-1, 1).fill_(neginf); - log_beta_a[input_length-1][2*target_length] = log_probs_a[input_length-1][BLANK]; - grad_a[input_length-1][BLANK] = log_alpha_a[input_length-1][2*target_length] + log_beta_a[input_length-1][2*target_length]; - - if (target_length > 0) { - auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 2*target_length-1, BLANK); - log_beta_a[input_length-1][2*target_length-1] = log_probs_a[input_length-1][current_target_prime]; - - // the first two are a blank and a non-blank, so we know they are different and we don't need to do log+ - grad_a[input_length-1][current_target_prime] = log_alpha_a[input_length-1][2*target_length-1] + log_beta_a[input_length-1][2*target_length-1]; - } - } - - // now loop applying eq (10) / (11) - for (int64_t t=input_length-2; t>=0; t--) { - // this loop over s could be parallel/vectorized and doesn't really need to be descending... - // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending) - // for the cuda implementation, that gave a speed boost. 
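// For reference, the loop below evaluates the mirror-image recurrence
//   beta_t(s) = y_t(l'_s) * ( beta_{t+1}(s) + beta_{t+1}(s+1)
//                             + [s < 2*target_length - 1 and l'_{s+2} != l'_s] * beta_{t+1}(s+2) )
// in log space, using the usual shift-by-max trick
//   log(sum_i exp(x_i)) = m + log(sum_i exp(x_i - m)),   m = max_i x_i
// (with m clamped to 0 when every term is -inf), so that no exp() overflows.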
- for (int64_t s=2*target_length; s>=0; s--) { - scalar_t lb1 = log_beta_a[t+1][s]; - scalar_t lbmax = lb1; - scalar_t lb2, lb3; - auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - if (s < 2*target_length) { - lb2 = log_beta_a[t+1][s+1]; - if (lb2 > lbmax) - lbmax = lb2; - } else { - lb2 = neginf; - } - if ((s < 2*target_length-1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != - current_target_prime)) { - lb3 = log_beta_a[t+1][s+2]; - if (lb3 > lbmax) - lbmax = lb3; - } else { - lb3 = neginf; - } - if (lbmax == neginf) - lbmax = 0; - - log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime]; - // one might check whether one can vectorize this better when done after the t-loop... - // now that we have beta, we fill in the sum of alpha*beta in eq (16) - // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency - // issue (several s can map to the same target character) - // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] - scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s]; - scalar_t &lcab = grad_a[t][current_target_prime]; - if (lcab == neginf) { - lcab = log_alpha_beta; - } else { - scalar_t max = std::max(lcab, log_alpha_beta); - lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; - } - } - } - - // now grad has the sum of eq (16) - // now we wrap up the calculation by adding in the remaining items of eq (16) - // this could be a great target for further vectorization. - // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) - scalar_t nll = neg_log_likelihood.accessor()[b]; - scalar_t gr = grad_out.accessor()[b]; - for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? 
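// At this point grad_a[t][c] holds log( sum over s with l'_s == c of
// alpha_t(s) * beta_t(s) ), or neginf if no such s exists; the loop below
// turns it into the per-activation gradient of eq (16), combining it with the
// stored negative log-likelihood nll and the incoming gradient gr, roughly
//   exp(lp) - exp(res + nll - lp),
// i.e. y_t(c) - (sum_s alpha_t(s) * beta_t(s)) / (y_t(c) * p(l|x)),
// since nll = -log p(l|x).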
- for (int64_t c = 0; c < num_labels; c++) { - scalar_t& res = grad_a[t][c]; - scalar_t lp = log_probs_a[t][c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; - } - } - // zero the remainder - if (input_length < max_input_length) { - grad.narrow(0, input_length, max_input_length - input_length).narrow(1, b, 1).zero_(); - } - } - return grad; -} - -} // namespace - -std::tuple ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } else { - return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } - }); -} - -Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } else { - return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } - }); -} - -// this wrapper function dispatches to the native and cudnn implementations and hides the alpha/grad from the user (by just returning the loss) -// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients -// it also handles the reduction if desired -Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, int64_t reduction) { - auto& ctx = at::globalContext(); - - bool use_cudnn = - detail::getCUDAHooks().compiledWithCuDNN() && - (detail::getCUDAHooks().versionCuDNN() >= 7000) && - ctx.userEnabledCuDNN() && - (BLANK == 0) && (targets.dim()==1) && - (log_probs.type().scalarType() == at::kFloat) && - (targets.type().scalarType() == at::kInt) && - (log_probs.type().backend() == Backend::CUDA); - - if (use_cudnn) { - // we don't know that input_lengths and target_lengths have the same size (they should, but we didn't check yet) - int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < input_lengths.size(); b++) { - use_cudnn &= (input_lengths[b] == max_input_length); - } - for (int64_t b = 0; b < target_lengths.size(); b++) { - use_cudnn &= (target_lengths[b] <= 256); - } - } - - Tensor res; - if (use_cudnn) { - res = std::get<0>(at::_cudnn_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK, ctx.deterministicCuDNN())); - } else { - res = std::get<0>(at::_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK)); - } - if (reduction == Reduction::ElementwiseMean) { - auto target_lengths_t = at::tensor(target_lengths, res.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(res.type()); - return (res / target_lengths_t).mean(); - } else if (reduction == Reduction::Sum) { - return res.sum(); - } - return res; -} - -} } // at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index b84b9c3f36b3ea..d6ebbd4573a70c 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ 
b/aten/src/ATen/native/TensorFactories.cpp @@ -141,9 +141,17 @@ Tensor& eye_out_cpu(Tensor& result, int64_t n) { } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else if(m < 0) { +#endif m = n; } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index be7e626fa1b748..f7ced03c5ab6fc 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -12,10 +12,6 @@ namespace at { namespace native { -std::vector broadcast_tensors(TensorList tensors) { - return expand_outplace(tensors); -} - static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; @@ -82,6 +78,9 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ } else { diag_size = std::max(std::min(self.size(dim1)+offset, self.size(dim2)), 0); } +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(diag_size > 0, "invalid diagonal offset ", offset); // the diagonal offset was too large in magnitude +#endif // NumPy allows you to specify offsets "off the end"; let's just be careful not to // set a ridiculous storage_offset in that case (technically it shouldn't matter @@ -96,8 +95,8 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minumum) // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics - auto sizes = self.sizes().vec(); - auto strides = self.strides().vec(); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); sizes.erase(sizes.begin() + std::max(dim1, dim2)); strides.erase(strides.begin() + std::max(dim1, dim2)); sizes.erase(sizes.begin() + std::min(dim1, dim2)); @@ -158,7 +157,11 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } +#ifndef USE_TH_SIZE_ZERO_DIM + if (length <= 0 || start > cur_size - length) { +#else if (length < 0 || start > cur_size - length) { +#endif AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); } return at::slice(self, dim, start, start + length, 1); @@ -243,6 +246,14 @@ static std::vector infer_size(IntList shape, int64_t numel) { AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape); res[*infer_dim] = numel / newsize; } +#ifndef USE_TH_SIZE_ZERO_DIM + if (numel == 0) { + // Collapse zero-element shapes into one dimension because TH handles zeros + // in sizes strangely: x.resize_(1, 0) has shape (1,). TODO: remove this + // once we have multi-dimensional empty tensors. 
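// In other words, with USE_TH_SIZE_ZERO_DIM undefined, any zero-element
// reshape that reaches this point discards the requested shape and collapses
// to a 1-d tensor of size 0 (e.g. a request for (2, 0) comes back as (0,)).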
+ return {0}; + } +#endif return res; } @@ -280,8 +291,8 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { if (index < 0) { index += size; } - auto sizes = self.sizes().vec(); - auto strides = self.strides().vec(); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); auto storage_offset = self.storage_offset() + index * strides[dim]; sizes.erase(sizes.begin() + dim); strides.erase(strides.begin() + dim); @@ -292,8 +303,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ int64_t ndim = self.dim(); AT_CHECK(ndim > 0, "slice() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); - auto sizes = self.sizes().vec(); - auto strides = self.strides().vec(); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); if (step <= 0) { // TODO: support negative strides throw std::runtime_error("slice step must be positive"); @@ -316,6 +327,12 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ } auto storage_offset = self.storage_offset() + start * strides[dim]; auto len = end - start; +#ifndef USE_TH_SIZE_ZERO_DIM + if (len == 0) { + // TODO: currently we don't have support for 0-sized dims, return size 0 tensor for now + return self.type().tensor(); + } +#endif sizes[dim] = (len + step - 1) / step; // round-up strides[dim] *= step; return self.as_strided(sizes, strides, storage_offset); @@ -407,7 +424,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di } if (self._indices().numel() == 0 && self._values().numel() == 0) { - auto sizes = self.sizes().vec(); + std::vector sizes(self.sizes()); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, self._sparseDims(), self._denseDims()); @@ -422,7 +439,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di row0.copy_(row1); row1.copy_(tmp); - auto sizes = self.sizes().vec(); + std::vector sizes(self.sizes()); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, -1, -1); @@ -441,8 +458,8 @@ Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self, dim0, dim1); } - auto strides = self.strides().vec(); - auto sizes = self.sizes().vec(); + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided_(sizes, strides); @@ -461,8 +478,8 @@ Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self_clone, dim0, dim1); } - auto strides = self.strides().vec(); - auto sizes = self.sizes().vec(); + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided(sizes, strides); @@ -522,8 +539,13 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { std::tuple, std::vector > inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - auto sizes = tensor.sizes().vec(); - auto strides = tensor.strides().vec(); +#ifndef USE_TH_SIZE_ZERO_DIM + if (tensor.numel() == 0) { + throw std::runtime_error("cannot unsqueeze empty tensor"); + } +#endif + std::vector sizes(tensor.sizes()); + std::vector strides(tensor.strides()); int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; sizes.insert(sizes.begin() + dim, 1); strides.insert(strides.begin() + dim, new_stride); @@ -541,7 +563,7 @@ Tensor squeeze(const Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, dims); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided(self.sizes(), self.strides()); + return self.as_strided(self.sizes().vec(), self.strides().vec()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided(std::get<0>(g), std::get<1>(g)); @@ -557,7 +579,7 @@ Tensor & squeeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim()); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided_(self.sizes(), self.strides()); + return self.as_strided_(self.sizes().vec(), self.strides().vec()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided_(std::get<0>(g), std::get<1>(g)); diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 0648387b35d5ae..84759874ef5355 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -13,7 +13,7 @@ Tensor flip_cpu(const Tensor& self, IntList dims) { const int64_t total_dims = self.dim(), flip_dims_size = dims.size(); flip_check_errors(total_dims, flip_dims_size, dims); - auto flip_dims_v = dims.vec(); + auto flip_dims_v = std::vector(dims); wrap_all_dims(flip_dims_v, total_dims); std::sort(flip_dims_v.begin(), flip_dims_v.end()); auto final_indices = std::vector(total_dims); diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 9b8c7d62b585c6..2504a2c3f201b8 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -14,7 +14,7 @@ static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, AT_CHECK(flip_dims_size > 0 && flip_dims_size <= total_dims, "flip dims size out of range, got flip dims size=", flip_dims_size); - auto flip_dims_v = dims.vec(); + auto flip_dims_v = std::vector(dims); // check if dims axis within range auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end()); diff --git a/aten/src/ATen/native/Vision.cpp b/aten/src/ATen/native/Vision.cpp new file mode 100644 index 00000000000000..458e9aca23f0fe --- /dev/null +++ b/aten/src/ATen/native/Vision.cpp @@ -0,0 +1,28 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" + +namespace { + enum GridSamplerMode {GridSamplerModeZeros, GridSamplerModeBorder}; +} + +namespace at { namespace native { + +Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { + // cudnn does not support inputs larger than 1024 + if (at::native::cudnn_is_acceptable(input) && + padding_mode == GridSamplerModeZeros && + input.dim() == 4 && + input.size(1) <= 1024) { + return cudnn_grid_sampler(input, grid); + } + if (input.dim() == 4) { + return thnn_grid_sampler_bilinear2d(input, grid, padding_mode); + } + if (input.dim() == 5) { + return thnn_grid_sampler_bilinear3d(input, grid, padding_mode); + } + AT_ERROR("grid_sampler(): input must be 4d or 5d but got input of shape: ", input.dim()); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu deleted file mode 100644 index a47865f2023474..00000000000000 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ /dev/null @@ -1,788 +0,0 @@ -#include "ATen/ATen.h" -#include 
"ATen/native/GridSampler.h" -#include "ATen/cuda/CUDAContext.h" -#include "ATen/cuda/CUDAApplyUtils.cuh" -#include "ATen/cuda/detail/TensorInfo.cuh" -#include "ATen/cuda/detail/IndexUtils.cuh" -#include "ATen/cuda/detail/KernelUtils.h" - -namespace at { namespace native { - -using namespace at::cuda::detail; - -using at::native::detail::GridSamplerInterpolation; -using at::native::detail::GridSamplerPadding; - -namespace { - static __forceinline__ __device__ - int clip_coordinates(int in, int clip_limit) { - return ::min(clip_limit - 1, ::max(in, static_cast(0))); - } - - static __forceinline__ __device__ - bool within_bounds_2d(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; - } - - static __forceinline__ __device__ - bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; - } - - template - static __forceinline__ __device__ - void safe_add_2d(scalar_t *data, int h, int w, - int sH, int sW, int H, int W, - scalar_t delta) { - if (within_bounds_2d(h, w, H, W)) { - atomicAdd(data + h * sH + w * sW, delta); - } - } - - template - static __forceinline__ __device__ - void safe_add_3d(scalar_t *data, int d, int h, int w, - int sD, int sH, int sW, int D, int H, int W, - scalar_t delta) { - if (within_bounds_3d(d, h, w, D, H, W)) { - atomicAdd(data + d * sD + h * sH + w * sW, delta); - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_2d_kernel( - const int nthreads, - TensorInfo input, - TensorInfo grid, - TensorInfo output, - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_H = input.sizes[2]; - int inp_W = input.sizes[3]; - int out_H = grid.sizes[1]; - int out_W = grid.sizes[2]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sH = input.strides[2]; - int inp_sW = input.strides[3]; - int grid_sN = grid.strides[0]; - int grid_sH = grid.strides[1]; - int grid_sW = grid.strides[2]; - int grid_sCoor = grid.strides[3]; - int out_sN = output.strides[0]; - int out_sC = output.strides[1]; - int out_sH = output.strides[2]; - int out_sW = output.strides[3]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast(::floor(ixf)); - int iy_nw = static_cast(::floor(iyf)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_nw = clip_coordinates(ix_nw, inp_W); - iy_nw = clip_coordinates(iy_nw, inp_H); - ix_ne = clip_coordinates(ix_ne, inp_W); - iy_ne = 
clip_coordinates(iy_ne, inp_H); - ix_sw = clip_coordinates(ix_sw, inp_W); - iy_sw = clip_coordinates(iy_sw, inp_H); - ix_se = clip_coordinates(ix_se, inp_W); - iy_se = clip_coordinates(iy_se, inp_H); - } - - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_3d_kernel( - const int nthreads, - TensorInfo input, - TensorInfo grid, - TensorInfo output, - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_D = input.sizes[2]; - int inp_H = input.sizes[3]; - int inp_W = input.sizes[4]; - int out_D = grid.sizes[1]; - int out_H = grid.sizes[2]; - int out_W = grid.sizes[3]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sD = input.strides[2]; - int inp_sH = input.strides[3]; - int inp_sW = input.strides[4]; - int grid_sN = grid.strides[0]; - int grid_sD = grid.strides[1]; - int grid_sH = grid.strides[2]; - int grid_sW = grid.strides[3]; - int grid_sCoor = grid.strides[4]; - int out_sN = output.strides[0]; - int out_sC = output.strides[1]; - int out_sD = output.strides[2]; - int out_sH = output.strides[3]; - int out_sW = output.strides[4]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - float izf = ((iz + 1.f) / 2) * (inp_D - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - iz = static_cast(izf); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast(::floor(ix)); - int iy_tnw = static_cast(::floor(iy)); - int iz_tnw = static_cast(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = 
iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw = clip_coordinates(ix_tnw, inp_W); - iy_tnw = clip_coordinates(iy_tnw, inp_H); - iz_tnw = clip_coordinates(iz_tnw, inp_D); - ix_tne = clip_coordinates(ix_tne, inp_W); - iy_tne = clip_coordinates(iy_tne, inp_H); - iz_tne = clip_coordinates(iz_tne, inp_D); - ix_tsw = clip_coordinates(ix_tsw, inp_W); - iy_tsw = clip_coordinates(iy_tsw, inp_H); - iz_tsw = clip_coordinates(iz_tsw, inp_D); - ix_tse = clip_coordinates(ix_tse, inp_W); - iy_tse = clip_coordinates(iy_tse, inp_H); - iz_tse = clip_coordinates(iz_tse, inp_D); - ix_bnw = clip_coordinates(ix_bnw, inp_W); - iy_bnw = clip_coordinates(iy_bnw, inp_H); - iz_bnw = clip_coordinates(iz_bnw, inp_D); - ix_bne = clip_coordinates(ix_bne, inp_W); - iy_bne = clip_coordinates(iy_bne, inp_H); - iz_bne = clip_coordinates(iz_bne, inp_D); - ix_bsw = clip_coordinates(ix_bsw, inp_W); - iy_bsw = clip_coordinates(iy_bsw, inp_H); - iz_bsw = clip_coordinates(iz_bsw, inp_D); - ix_bse = clip_coordinates(ix_bse, inp_W); - iy_bse = clip_coordinates(iy_bse, inp_H); - iz_bse = clip_coordinates(iz_bse, inp_D); - } - - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCDHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += 
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_2d_backward_kernel( - const int nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, - TensorInfo grad_input, // initialized to zeros - TensorInfo grad_grid, // initialized to empty - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_H = input.sizes[2]; - int inp_W = input.sizes[3]; - int out_H = grid.sizes[1]; - int out_W = grid.sizes[2]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sH = input.strides[2]; - int inp_sW = input.strides[3]; - int grid_sN = grid.strides[0]; - int grid_sH = grid.strides[1]; - int grid_sW = grid.strides[2]; - int grid_sCoor = grid.strides[3]; - int gOut_sN = grad_output.strides[0]; - int gOut_sC = grad_output.strides[1]; - int gOut_sH = grad_output.strides[2]; - int gOut_sW = grad_output.strides[3]; - int gInp_sN = grad_input.strides[0]; - int gInp_sC = grad_input.strides[1]; - int gInp_sH = grad_input.strides[2]; - int gInp_sW = grad_input.strides[3]; - int gGrid_sW = grad_grid.strides[2]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast(::floor(ixf)); - int iy_nw = static_cast(::floor(iyf)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_nw_cl = clip_coordinates(ix_nw, inp_W); - iy_nw_cl = clip_coordinates(iy_nw, inp_H); - ix_ne_cl = clip_coordinates(ix_ne, inp_W); - iy_ne_cl = clip_coordinates(iy_ne, inp_H); - ix_sw_cl = clip_coordinates(ix_sw, inp_W); - iy_sw_cl = clip_coordinates(iy_sw, inp_H); - ix_se_cl = clip_coordinates(ix_se, inp_W); - iy_se_cl = clip_coordinates(iy_se, inp_H); - } else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + 
n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1.f) / 2; - giy = giy * (inp_H - 1.f) / 2; - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW - // 2. 
directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] - scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NHW[0] = gix; - gGrid_ptr_NHW[1] = giy; - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_3d_backward_kernel( - const int nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, - TensorInfo grad_input, // initialized to zeros - TensorInfo grad_grid, // initialized to empty - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_D = input.sizes[2]; - int inp_H = input.sizes[3]; - int inp_W = input.sizes[4]; - int out_D = grid.sizes[1]; - int out_H = grid.sizes[2]; - int out_W = grid.sizes[3]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sD = input.strides[2]; - int inp_sH = input.strides[3]; - int inp_sW = input.strides[4]; - int grid_sN = grid.strides[0]; - int grid_sD = grid.strides[1]; - int grid_sH = grid.strides[2]; - int grid_sW = grid.strides[3]; - int grid_sCoor = grid.strides[4]; - int gOut_sN = grad_output.strides[0]; - int gOut_sC = grad_output.strides[1]; - int gOut_sD = grad_output.strides[2]; - int gOut_sH = grad_output.strides[3]; - int gOut_sW = grad_output.strides[4]; - int gInp_sN = grad_input.strides[0]; - int gInp_sC = grad_input.strides[1]; - int gInp_sD = grad_input.strides[2]; - int gInp_sH = grad_input.strides[3]; - int gInp_sW = grad_input.strides[4]; - int gGrid_sW = grad_grid.strides[3]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - float izf = ((iz + 1.f) / 2) * (inp_D - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - iz = static_cast(izf); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast(::floor(ix)); - int iy_tnw = static_cast(::floor(iy)); - int iz_tnw = static_cast(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - 
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); - iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); - iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); - ix_tne_cl = clip_coordinates(ix_tne, inp_W); - iy_tne_cl = clip_coordinates(iy_tne, inp_H); - iz_tne_cl = clip_coordinates(iz_tne, inp_D); - ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); - iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); - iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); - ix_tse_cl = clip_coordinates(ix_tse, inp_W); - iy_tse_cl = clip_coordinates(iy_tse, inp_H); - iz_tse_cl = clip_coordinates(iz_tse, inp_D); - ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); - iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); - iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); - ix_bne_cl = clip_coordinates(ix_bne, inp_W); - iy_bne_cl = clip_coordinates(iy_bne, inp_H); - iz_bne_cl = clip_coordinates(iz_bne, inp_D); - ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); - iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); - iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); - ix_bse_cl = clip_coordinates(ix_bse, inp_W); - iy_bse_cl = clip_coordinates(iy_bse, inp_H); - iz_bse_cl = clip_coordinates(iz_bse, inp_D); - } else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCDHW; - - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, 
gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { - scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { - scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { - scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { - scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { - scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { - scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { - scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * inp_sW]; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { - scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - giz = giz * (inp_D - 1) / 2; - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW - // 2. 
directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] - scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NDHW[0] = gix; - gGrid_ptr_NDHW[1] = giy; - gGrid_ptr_NDHW[2] = giz; - } - } -} // namespace - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto H = grid.size(1); - auto W = grid.size(2); - auto output = at::empty({N, input.size(1), H, W}, input.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { - int count = static_cast(N * H * W); - grid_sampler_2d_kernel - <<>>( - count, - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(output), - static_cast(padding_mode)); - }); - return output; -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto D = grid.size(1); - auto H = grid.size(2); - auto W = grid.size(3); - auto output = at::empty({N, input.size(1), D, H, W}, input.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { - int count = static_cast(N * D * H * W); - grid_sampler_3d_kernel - <<>>( - count, - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(output), - static_cast(padding_mode)); - }); - return output; -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto H = grid.size(1); - auto W = grid.size(2); - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_backward_cuda", [&] { - int count = static_cast(N * H * W); - grid_sampler_2d_backward_kernel - <<>>( - count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(grad_input), - getTensorInfo(grad_grid), - static_cast(padding_mode)); - }); - return std::make_tuple(grad_input, grad_grid); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
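The grid sampler kernels deleted above all follow the same recipe: un-normalize each grid coordinate from [-1, 1] to pixel space, gather the surrounding integer neighbours, and blend them with area ("surface") weights, reading zero for out-of-bounds taps in zeros padding mode. A minimal CPU sketch of the 2-D bilinear case, plain C++ with a hypothetical helper name (not the CUDA kernel):

    #include <cmath>
    #include <vector>

    // Bilinear sample of a single-channel H x W image at a normalized grid
    // coordinate (gx, gy) in [-1, 1], with zeros padding outside the image.
    float grid_sample_2d_sketch(const std::vector<float>& img, int H, int W, float gx, float gy) {
      // Un-normalize from [-1, 1] to pixel coordinates [0, W-1] / [0, H-1].
      float ix = (gx + 1.f) / 2.f * (W - 1);
      float iy = (gy + 1.f) / 2.f * (H - 1);
      int ix_nw = static_cast<int>(std::floor(ix));
      int iy_nw = static_cast<int>(std::floor(iy));
      // Area weights of the four neighbours; they sum to 1.
      float nw = (ix_nw + 1 - ix) * (iy_nw + 1 - iy);
      float ne = (ix - ix_nw)     * (iy_nw + 1 - iy);
      float sw = (ix_nw + 1 - ix) * (iy - iy_nw);
      float se = (ix - ix_nw)     * (iy - iy_nw);
      auto at = [&](int y, int x) -> float {
        // Zeros padding: out-of-bounds reads contribute nothing.
        return (y >= 0 && y < H && x >= 0 && x < W) ? img[y * W + x] : 0.f;
      };
      return at(iy_nw, ix_nw) * nw + at(iy_nw, ix_nw + 1) * ne +
             at(iy_nw + 1, ix_nw) * sw + at(iy_nw + 1, ix_nw + 1) * se;
    }

Border padding differs only in clipping the neighbour indices to the image before the reads, as the deleted clip_coordinates helper does.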
-std::tuple -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto D = grid.size(1); - auto H = grid.size(2); - auto W = grid.size(3); - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_3d_backward_cuda", [&] { - int count = static_cast(N * D * H * W); - grid_sampler_3d_backward_kernel - <<>>( - count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(grad_input), - getTensorInfo(grad_grid), - static_cast(padding_mode)); - }); - return std::make_tuple(grad_input, grad_grid); -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 12f22fcaf2f216..4b474e0c079e77 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -76,9 +76,6 @@ void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { using arg0_t = typename traits::result_type; int64_t numel = iter.numel(); - if (numel == 0) { - return; - } if (iter.is_trivial_1d()) { auto strides = iter.get_inner_strides(); int stride0 = strides[0]; @@ -108,9 +105,6 @@ void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { using arg1_t = typename traits::arg1_t; int64_t numel = iter.numel(); - if (numel == 0) { - return; - } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); @@ -158,9 +152,6 @@ void gpu_binary_kernel(TensorIterator& iter, const func_t& f) { using arg2_t = typename traits::arg2_t; int numel = iter.numel(); - if (numel == 0) { - return; - } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu deleted file mode 100644 index 70ece3f4440cf7..00000000000000 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ /dev/null @@ -1,625 +0,0 @@ -// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann -// Licensed under the BSD-3-Clause license -// This is the GPU implementation of the Connectionist Temporal Loss. -// We mostly follow Graves. -// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf -// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. -// Graves et al call the probabilities y, we use log_probs (also calling them inputs) -// A few optimizations (simmilar to those here, but also some I didn't take) are described in -// 2. Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf - -#include -#include - -#include -#include "ATen/Dispatch.h" -#include "ATen/cuda/CUDAApplyUtils.cuh" - -#include -#include - -namespace at { -namespace native { - -namespace { - -// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done -// __restrict__ impact to be measured, https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ -template -__device__ static inline int64_t get_target_prime(const target_t* __restrict__ target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { - if (idx % 2 == 0) { - return BLANK; - } else { - return target[offset + stride * (idx / 2)]; - } -} - -// this kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). 
-// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). -// In total it would be more efficient to compute the beta in the same kernel (e.g. cudnn does this). While the beta are not -// needed for the loss itself (just the grad), we can return log_alpha+log_beta (so same space as currently) and the overhead -// is small and the use-case for loss without grad is relatively limited. -// We parallelize by batch and target sequence. Empirically, it is faster to loop over the input (log probs) sequence and do -// target in parallel, even if it means more frequent __syncthreads. -// In contrast to the cuDNN implementation, we allow large target lengths. For this we need that all previous `s` have been -// computed when we start a new block_s. This is why we have our own for loop here. -template -__global__ void ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - scalar_t* __restrict__ neg_log_likelihood_data, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t BLANK) { - - constexpr scalar_t neginf = -INFINITY; - - // bookkeeping - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t la_batch_offset = b*la_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - if (b >= batch_size) - return; - - // first row (t=0), the three equations for alpha_1 above eq (6) - for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { - int64_t s = threadIdx.x + block_s; - scalar_t la; - switch (s) { - case 0: - la = log_probs_data[lp_batch_offset + lp_char_stride * BLANK]; - break; - case 1: - if (target_length > 0) { - la = log_probs_data[lp_batch_offset + lp_char_stride * get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; - } - else { - la = neginf; - } - break; - default: - la = neginf; - } - if (s < 2*max_target_length+1) - log_alpha_data[la_batch_offset + /* la_input_stride * 0 */ + la_target_stride * s] = la; - } - - for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { - int64_t s = threadIdx.x + block_s; - - // These two only depend on s, so we can cache them. - int64_t current_char; // l_s in eq (6) - bool have_three; // flag which of the two cases in eq (6) we have - if (s < 2*target_length+1) { - current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != - current_char)); - } else { - current_char = BLANK; - have_three = false; - } - for (int64_t t=1; t < max_input_length; t++) { - __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch - if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) { - // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands, - // lamax is the maximum for the logsumexp trick. 
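The comment above refers to the usual logsumexp trick: subtract the running maximum before exponentiating so finite terms cannot overflow, and guard the all-neginf case. A tiny standalone illustration (plain C++, not the kernel code):

    #include <algorithm>
    #include <cmath>
    #include <limits>

    // Numerically stable log(exp(a) + exp(b) + exp(c)), matching the pattern
    // used in the alpha/beta recursions: factor out the maximum first.
    double log_sum_exp3(double a, double b, double c) {
      double m = std::max({a, b, c});
      if (m == -std::numeric_limits<double>::infinity()) {
        // All inputs are log(0); the sum is log(0) too, and exp(x - m) would
        // be NaN, so mirror the kernel's "pretend the max is 0" guard.
        return m;
      }
      return std::log(std::exp(a - m) + std::exp(b - m) + std::exp(c - m)) + m;
    }

The same guard reappears in the beta recursion further down.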
- scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s]; - scalar_t lamax = la1; - scalar_t la2, la3; - if (s > 0) { - la2 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-1)]; - if (la2 > lamax) - lamax = la2; - } else { - la2 = neginf; - } - if (have_three) { - la3 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-2)]; - if (la3 > lamax) - lamax = la3; - } else { - la3 = neginf; - } - if (lamax == neginf) // when all are neginf. (then the whole thing is neginf, but we can pretend) - lamax = 0; - - log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax - + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char]; - } else { - // otherwise we just set to neginf - if (s < 2*max_target_length+1) - log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf; - } - } - } - __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch - - // compute the loss (eq (8)) - if (threadIdx.x == 0) { - scalar_t l1 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2)]; - scalar_t l2 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2-1)]; - scalar_t m = ((l1 > l2) ? l1 : l2); - m = ((m == neginf) ? 0 : m); - scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; - neg_log_likelihood_data[b] = -log_likelihood; - } -} - -// The forward computation. Lot's of admin and a call to the alpha kernel. -// Note: we do not check that the labels are in the valid range. As we use -// them for indexing in the kernels, you'll see memory errors when you -// pass corrupt labels. -// We support both a 2-dimensional tensor as targets (one set of targets in each row) and -// a 1-dimensional tensor where all targets are concatenated (and we use target_lengths -// to figure out where they begin). -// We return log_alpha (currently, might change to (log_alpha+log_beta) to be passed to the -// backward. The dispatch function will only return the loss. 
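The forward wrapper described above accepts targets either as one concatenated 1-D tensor (per-sample offsets are running sums of target_lengths) or as a padded 2-D batch x max_target_length tensor (offsets are multiples of the batch stride). A small host-side sketch of that bookkeeping, with hypothetical names:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct TargetLayout {
      std::vector<int64_t> batch_offsets;  // start of each sample's targets
      int64_t max_target_length;           // longest target in the batch
    };

    // Concatenated 1-D targets: offsets are running sums of the lengths.
    // Padded 2-D targets: offsets are i * batch_stride and the max length is
    // simply the padded width. Mirrors the bookkeeping described above.
    TargetLayout make_offsets(const std::vector<int64_t>& target_lengths,
                              bool concatenated, int64_t batch_stride, int64_t padded_len) {
      TargetLayout out{std::vector<int64_t>(target_lengths.size()), 0};
      int64_t pos = 0;
      for (size_t i = 0; i < target_lengths.size(); ++i) {
        if (concatenated) {
          out.batch_offsets[i] = pos;
          pos += target_lengths[i];
          out.max_target_length = std::max(out.max_target_length, target_lengths[i]);
        } else {
          out.batch_offsets[i] = static_cast<int64_t>(i) * batch_stride;
          out.max_target_length = padded_len;
        }
      }
      return out;
    }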
-template -std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - // log_probs: input_len x batch_size x num_labels - // targets [int64]: batch_size x target_length OR sum(target_lengths) - CheckedFrom c = "ctc_loss_gpu"; - using target_t = typename std::conditional::type; - auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to log_probs cuda if it isn't there already - auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); - auto targets_arg = TensorArg(targets, "targets", 2); - checkAllSameGPU(c, {log_probs_arg, targets_arg}); - - checkScalarType(c, targets_arg, target_scalar_type); - checkDim(c, log_probs_arg, 3); - checkDimRange(c, targets_arg, 1, 3); - - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - AT_CHECK(BLANK < num_labels, "blank must be in label range"); - AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); - - int64_t lp_input_stride = log_probs.stride(0); - int64_t lp_char_stride = log_probs.stride(2); - int64_t tg_target_stride; - - int64_t max_target_length; - auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); - auto tg_batch_offsets_data = tg_batch_offsets.data(); - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - checkSize(c, targets_arg, 0, pos); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, - "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, - "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - - auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); - - Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); - Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - - // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... 
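The launch configuration computed next halves the x-dimension thread count while it still spans the 2*max_target_length+1 augmented labels, hands the leftover threads to batch items, and sizes the grid to cover the remainder. Roughly, as a host-side sketch with hypothetical names (not the actual launch code):

    #include <algorithm>
    #include <cstdint>

    struct LaunchShape { int threads_target, threads_batch, blocks_target, blocks_batch; };

    // Pick a (target, batch) thread split under a 1024-thread cap: shrink the
    // target dimension while it still covers 2*max_target_length+1, then use
    // the leftover capacity for batch items; blocks cover whatever is left.
    LaunchShape pick_launch_shape(int64_t max_target_length, int64_t batch_size) {
      constexpr int max_threads = 1024;
      int64_t span = 2 * max_target_length + 1;
      int threads_target = max_threads;
      while (threads_target / 2 >= span) threads_target /= 2;
      int threads_batch = static_cast<int>(std::min<int64_t>(max_threads / threads_target, batch_size));
      int blocks_target = static_cast<int>((span + threads_target - 1) / threads_target);
      int blocks_batch  = static_cast<int>((batch_size + threads_batch - 1) / threads_batch);
      return {threads_target, threads_batch, blocks_target, blocks_batch};
    }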
- constexpr int max_threads = 1024; - int threads_target = max_threads; - while (threads_target / 2 >= 2*max_target_length+1) { - threads_target /= 2; - } - int threads_batch = std::min(max_threads / threads_target, (int) batch_size); - - dim3 block(threads_target, threads_batch); - dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - ctc_loss_log_alpha_gpu_kernel<<>>( - log_alpha.data(), - log_probs.data(), input_lengths_t.data(), log_probs.size(0), - targets.data(), target_lengths_t.data(), max_target_length, - neg_log_likelihood.data(), - log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), - log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), - tg_batch_offsets.data(), tg_target_stride, - batch_size, BLANK); - return std::make_tuple(neg_log_likelihood, log_alpha); -} - -// The second (backward) half of the forward backward algorithm, (10) and (11). This is parallel to the -// alpha kernel above. (As mentioned above, it might make sense do the calculation in the alpha kernel.) -template -__global__ void ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t BLANK) { - constexpr scalar_t neginf = -INFINITY; - - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t lb_batch_offset = b*lb_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - if (b >= batch_size) - return; - - // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) - for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { - int64_t s = threadIdx.x + block_s; - scalar_t lb; - if (s == 2*target_length) { - lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * BLANK]; - } else if ((target_length > 0) && (s == 2*target_length-1)) { - int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * current_target_prime]; - } else { - lb = neginf; - } - if (s < 2*max_target_length+1) { - log_beta_data[lb_batch_offset + (input_length-1) * lb_input_stride + lb_target_stride * s] = lb; - } - } - - // go backward in s - for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { - int64_t s = threadIdx.x + block_s; - int64_t current_target_prime; - bool have_three; - if (s < 2*target_length+1) { - current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - have_three = ((s < 2*target_length-1) && - (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != - current_target_prime)); - } else { - current_target_prime = BLANK; - have_three = false; - } - // now 
go backward in t. Note that we need to skip the last timestep that we did above. - for (int64_t t=max_input_length-2; t>=0; t--) { - __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch item - if ((t < input_length-1) && (target_length > 0) && (s < 2*target_length+1)) { - scalar_t lb1 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * s]; - scalar_t lbmax = lb1; - scalar_t lb2, lb3; - - if (s < 2*target_length) { - lb2 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+1)]; - if (lb2 > lbmax) - lbmax = lb2; - } else { - lb2 = neginf; - } - if (have_three) { - lb3 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+2)]; - if (lb3 > lbmax) - lbmax = lb3; - } else { - lb3 = neginf; - } - if (lbmax == neginf) - lbmax = 0; - - scalar_t lb = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax - + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; - - log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; - } else if ((s < 2*max_target_length+1) || (t >= input_length)) { - log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; - } - } - } -} - -// This implements the subtrahend of equation (16) for all *nonblank* characters. -// It assumes you have probs in gradient_data when called -// and it modifies gradient_data to be, the gradient. -// In order to facilitate this inplace update, We don't actually do this in logspace. -// (The other variant implemented uses log_space and the differences seem to be -// not so problematic at least with unit normal distributed test activations.) -// Internally this uses atomicAdd because different threads may write to the same -// gradient position. -// This is parallelised over b and s again. -// Note that for us, the Z of eqn (16) is actually constant for all t and it is the -// likelihood - this is why we use the negative log likelihood below. -// We also multiply by the input gradient to keep with standard autograd style. -// I took this trick from [2], for moderate alphabet sizes a log-space -// calculation (with an atomic log add) is similarly in performance, but for large -// alphabets the inplace nature is a considerable advantage. -template -__global__ void ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, - const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, - const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - const scalar_t* __restrict__ neg_log_likelihood_data, - int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, - int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t num_labels, int64_t BLANK) { - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime! 
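The kernel above assumes gradient_data already holds the probabilities and subtracts the eq. (16) term exp(log_alpha + log_beta + nll - log_prob) * grad_out at each (t, target[s]), using atomicAdd because several target positions can name the same character. A serial CPU sketch of that update for one batch element, with hypothetical names:

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // grad is [T x num_labels] and holds probabilities on entry; log_alpha and
    // log_beta are [T x S] with S = 2*target.size()+1 (blanks at even indices).
    void collect_nonblank_sketch(std::vector<double>& grad,
                                 const std::vector<double>& log_probs,
                                 const std::vector<double>& log_alpha,
                                 const std::vector<double>& log_beta,
                                 const std::vector<int64_t>& target,
                                 double nll, double grad_out,
                                 int64_t T, int64_t num_labels) {
      int64_t S = 2 * static_cast<int64_t>(target.size()) + 1;
      for (int64_t s = 0; s < static_cast<int64_t>(target.size()); ++s) {
        int64_t c = target[s];  // the non-blank character this s refers to
        for (int64_t t = 0; t < T; ++t) {
          double lp = log_probs[t * num_labels + c];
          // Augmented index 2*s+1 is where target[s] sits in l'.
          grad[t * num_labels + c] -=
              std::exp(log_alpha[t * S + 2 * s + 1] + log_beta[t * S + 2 * s + 1] + nll - lp) * grad_out;
        }
      }
    }

Working in probability space is what makes this update cheap to do in place; the log-space variant mentioned in the comment would need an atomic log-add instead.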
- - if (b >= batch_size) - return; - - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t gr_batch_offset = b*gr_batch_stride; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t la_batch_offset = b*la_batch_stride; - int64_t lb_batch_offset = b*lb_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - if (s >= target_length) - return; - - int64_t target = targets_data[tg_batch_offset + s * tg_target_stride]; - scalar_t nll = neg_log_likelihood_data[b]; - scalar_t gr = grad_out_data[b * grad_out_batch_stride]; - - for (int64_t t = 0; t < input_length; t++) { - scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target]; - atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target], - -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)] - + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)] - + nll - lp) * gr); - } -} - -// This is the naive implementation of equation (16). It is parallelised in batch and input timestep. -// It appears to be faster than the above method for small batch sizes. -template -__global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, - const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, - const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - const scalar_t* __restrict__ neg_log_likelihood_data, - int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, - int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t num_labels, int64_t BLANK) { - - constexpr scalar_t neginf = -INFINITY; - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - int64_t t = threadIdx.x + blockIdx.x * blockDim.x; - - if ((t >= max_input_length) || (b >= batch_size)) - return; - - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t gr_batch_offset = b*gr_batch_stride; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t la_batch_offset = b*la_batch_stride; - int64_t lb_batch_offset = b*lb_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] - for (int s = 0; s < 2*max_target_length+1; s++) { - if ((target_length > 0) && (s < 2*target_length+1)) { - int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - scalar_t log_alpha_beta = (log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] - + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s]); - scalar_t& lcab = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * current_target_prime]; - if (lcab == neginf) { - lcab = log_alpha_beta; - } else { - scalar_t max = ((lcab > log_alpha_beta) ? 
lcab : log_alpha_beta); - lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; - } - } - } - - scalar_t nll = neg_log_likelihood_data[b]; - scalar_t gr = grad_out_data[b * grad_out_batch_stride]; - - for (int64_t c = 0; c < num_labels; c++) { - scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; - if (t < input_length) { - scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; - } - else { - res = 0.; - } - } -} - -// The backward. It essentially computes eq 16 by using the above kernels. -// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward. -template -Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - constexpr scalar_t neginf = -INFINITY; - using target_t = typename std::conditional::type; - auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to cuda if it isn't there already - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - int64_t lp_input_stride = log_probs.stride(0); - int64_t lp_char_stride = log_probs.stride(2); - int64_t tg_target_stride; - - int64_t max_target_length; - auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); - auto tg_batch_offsets_data = tg_batch_offsets.data(); - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - } - auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); - - Tensor log_beta = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); - Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) - - // As above, there may be better configurations to use. 
- constexpr int max_threads = 1024;
- int threads_target = max_threads;
- while (threads_target / 2 >= 2*max_target_length+1) {
- threads_target /= 2;
- }
- int threads_batch = std::min(max_threads / threads_target, (int) batch_size);
-
- cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- {
- dim3 block(threads_target, threads_batch);
- dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch);
-
- ctc_loss_backward_log_beta_gpu_kernel<<<grid, block, 0, stream>>>
- (log_beta.data(),
- log_probs.data(), input_lengths_t.data(), log_probs.size(0),
- targets.data(), target_lengths_t.data(), max_target_length,
- log_probs.stride(0), log_probs.stride(1), log_probs.stride(2),
- log_beta.stride(0), log_beta.stride(1), log_beta.stride(2),
- tg_batch_offsets.data(), tg_target_stride,
- batch_size, BLANK);
- }
-
- // Very crude heuristic for what is a small problem, based on linearly regressing problem dimensions on
- // the (capped) difference of timings.
- // Note that for OK problems target length <= input length, so we
- // only consider input length.
- bool is_large = (2*log_probs.size(0)+(24*batch_size)/10+(2*num_labels)/10) > 450;
- if (is_large) { // large alphabet, large batch
- // this computes the probs, minuend in (16)
- exp_out(grad, log_probs);
- // now we compute the subtrahend for the blanks. It is a straightforward reduction because we know that
- // blanks are in every other position.
- // maybe we should kernelize this, too.
- auto grad_blank = grad.narrow(2, BLANK, 1);
- grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1},
- {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2})
- + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1},
- {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}),
- 2, true)
- .permute({1, 0, 2})
- .add_(neg_log_likelihood.view({1, batch_size, 1}))
- .sub_(log_probs.narrow(2, BLANK, 1))
- .exp_()
- );
- // For the non-blank characters, we use a kernel to compute the subtrahend.
- // Again we might configure block and grid in a better way.
- int threads_target = max_threads;
- while (threads_target / 2 >= max_target_length) {
- threads_target /= 2;
- }
- int threads_batch = std::min(max_threads / threads_target, (int) batch_size);
- dim3 block(threads_target, threads_batch);
- dim3 grid((max_target_length + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch);
- ctc_loss_backward_collect_nonblank_gpu_kernel<<<grid, block, 0, stream>>>
- (grad.data(),
- grad_out.data(), grad_out.stride(0),
- log_alpha.data(), log_beta.data(),
- log_probs.data(), input_lengths_t.data(), log_probs.size(0),
- targets.data(), target_lengths_t.data(), max_target_length,
- neg_log_likelihood.data(),
- grad.stride(0), grad.stride(1), grad.stride(2),
- log_probs.stride(0), log_probs.stride(1), log_probs.stride(2),
- log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2),
- log_beta.stride(0), log_beta.stride(1), log_beta.stride(2),
- tg_batch_offsets.data(), tg_target_stride,
- batch_size, num_labels, BLANK);
- } else { // small problem, use naive algorithm
- // Still no block/grid configuration guru...
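// [editor's note] The same block-sizing pattern appears three times in this function
// (for 2*max_target_length+1 above, for max_target_length in the non-blank branch,
// and for log_probs.size(0) just below): pick the smallest power of two, capped at
// 1024, that covers the parallelised axis, then hand the remaining threads of the
// block to the batch dimension. A standalone sketch of that heuristic; the helper
// name and free-function form are ours, not ATen API.
#include <algorithm>
#include <cstdint>
#include <utility>

static std::pair<int, int> pick_block_shape(int64_t axis_extent, int64_t batch_size,
                                            int max_threads = 1024) {
  int threads_axis = max_threads;
  // Halving stops once another halving would no longer cover axis_extent.
  while (threads_axis / 2 >= axis_extent) {
    threads_axis /= 2;
  }
  int threads_batch = std::min(max_threads / threads_axis, (int) batch_size);
  return {threads_axis, threads_batch};
}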
- int threads_input = max_threads; - while (threads_input / 2 >= log_probs.size(0)) { - threads_input /= 2; - } - threads_batch = std::min(max_threads / threads_input, (int) batch_size); - dim3 block(threads_input, threads_batch); - dim3 grid((log_probs.size(0) + threads_input-1)/threads_input, (batch_size+threads_batch-1)/threads_batch); - - ctc_loss_backward_collect_gpu_kernel<<>> - (grad.data(), - grad_out.data(), grad_out.stride(0), - log_alpha.data(), log_beta.data(), - log_probs.data(), input_lengths_t.data(), log_probs.size(0), - targets.data(), target_lengths_t.data(), max_target_length, - neg_log_likelihood.data(), - grad.stride(0), grad.stride(1), grad.stride(2), - log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), - log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), - log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), - tg_batch_offsets.data(), tg_target_stride, - batch_size, num_labels, BLANK); - } - return grad; -} - -} // namespace - -std::tuple ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } else { - return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } - }); -} - -Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } else { - return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } - }); -} - -} } // at::native diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 5cde662fba78a6..420733dc558c06 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -20,9 +20,17 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n) { } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else if(m < 0) { +#endif m = n; } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index f97395d6392ca6..7fa1fe64f28d6f 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -80,7 +80,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { return out_tensor; } - auto flip_dims = dims.vec(); + auto flip_dims = std::vector(dims); wrap_all_dims(flip_dims, total_dims); // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work @@ -99,10 +99,10 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast(flip_dims.size())}); - auto shape = 
in_tensor.sizes().vec(); + auto shape = std::vector(in_tensor.sizes()); auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast(shape.size())}); - auto strides = in_tensor.strides().vec(); + auto strides = std::vector(in_tensor.strides()); auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast(strides.size())}); // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp deleted file mode 100644 index 966aa20e0a128d..00000000000000 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include -#if AT_CUDNN_ENABLED() - #include -#endif - - -#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) - -namespace at { namespace native { - -// See Note [ATen preprocessor philosophy] - -std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { - throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); -} - -}} - -#else // AT_CUDNN_ENABLED - -#include -#include -#include - -#include - -namespace at { namespace native { - -namespace { - -} // namespace - -std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntList input_lengths_, IntList target_lengths_, int64_t BLANK, bool deterministic) { - CheckedFrom c = "cudnn_ctc_loss"; - TensorArg log_probs { log_probs_t, "log_probs", 1 }; - TensorArg targets { targets_t, "targets", 2 }; - checkDim(c, log_probs, 3); - checkScalarType(c, log_probs, kFloat); - checkDim(c, targets, 1); - checkScalarType(c, targets, kInt); - checkContiguous(c, targets); // ? - checkBackend(c, {*log_probs}, Backend::CUDA); - checkBackend(c, {*targets}, Backend::CPU); - int64_t batch_size = log_probs->size(1); - AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); - AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); - - std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); - std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); - - setCuDNNStreamToCurrent(); - AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); - // checked in dispatch: - // assert other conditions for cudnnCTCLoss: all label lengths <= 256 - // all input lengths = logprob.size(0) - - auto handle = getCudnnHandle(); - - cudnnCTCLossAlgo_t algo = (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); - - Tensor probs = log_probs->softmax(2); - TensorDescriptor probs_desc{probs}; - Tensor grad = at::empty_like(probs); - TensorDescriptor grad_desc{grad}; - - CTCLossDescriptor ctc_loss_desc; - ctc_loss_desc.set(CUDNN_DATA_FLOAT); - - size_t workspace_size; - AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(), - targets->data(), target_lengths.data(), input_lengths.data(), - algo, ctc_loss_desc.desc(), &workspace_size)); - - - Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? 
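// [editor's note] A hedged sketch of the "new way of doing this with empty?" that the
// comment above asks about: allocate the byte workspace through at::empty and
// TensorOptions instead of the deprecated type().tensor() path. The helper name and
// free-function form are ours; at::empty, options(), and dtype() are the same calls
// used elsewhere in this diff.
#include <ATen/ATen.h>

static at::Tensor byte_workspace_like(const at::Tensor& ref, size_t workspace_size) {
  // Same device/backend as `ref`, one byte per element, sized by the cuDNN query.
  return at::empty({static_cast<int64_t>(workspace_size)},
                   ref.options().dtype(at::kByte));
}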
- Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); - - AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), - targets->data(), target_lengths.data(), input_lengths.data(), - costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo, - ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size)); - - return std::make_tuple(costs, grad); -} - - -}} // namespace at::native - -#endif diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 08e84618e81db3..63f0d7a29578f9 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -166,7 +166,7 @@ namespace { std::vector descriptors(batch_sizes.size()); size_t i = 0; // To be mutated in the loop - auto batch_tensor_size = tensor.sizes().vec(); + std::vector batch_tensor_size(tensor.sizes()); for (auto batch_size : batch_sizes) { batch_tensor_size[0] = batch_size; // NB: cuDNN RNN API does not support 2d descriptors, so we @@ -994,7 +994,7 @@ std::tuple> _cudnn_rnn_backward( if (output_mask[3]) { dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); } - return std::tuple>{dx, dhx, dcx, dw}; + return std::tuple{dx, dhx, dcx, dw}; } // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2a8941675d6c9f..8692d6165ff72a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -29,11 +29,6 @@ - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor variants: function, method -- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) - variants: function - dispatch: - CUDA: _cudnn_ctc_loss - - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function dispatch: @@ -249,9 +244,6 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor variants: function -- func: broadcast_tensors(TensorList tensors) -> TensorList - variants: function - - func: cat(TensorList tensors, int64_t dim=0) -> Tensor variants: function @@ -512,21 +504,6 @@ - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor variants: function -- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - -- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: ctc_loss_cpu - CUDA: ctc_loss_gpu - -- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor - variants: function - dispatch: - CPU: ctc_loss_backward_cpu - CUDA: ctc_loss_backward_gpu - - func: det(Tensor self) -> Tensor - func: diagflat(Tensor self, int64_t offset=0) -> Tensor @@ -738,45 +715,9 @@ variants: function deprecated: true -# NOTE [ grid_sampler Native Functions ] -# 
`grid_sampler` does all the shape checking and then dispatches to one of -# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which -# has the corresponding backward defined as native functions as well. Therefore, -# in these functions and their backwards, no more shape checking is done. -# -# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to -# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in -# `interpolation_mode` because it only supports Bilinear interpolation mode. -# -# ssnl: Currently `interpolation_mode` is just a placeholder. It is not really -# used. Everywhere Bilinear is assumed. I will add Nearest soon. - func: grid_sampler(Tensor input, Tensor grid, int64_t padding_mode) -> Tensor variants: function -- func: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function - dispatch: - CPU: grid_sampler_2d_cpu - CUDA: grid_sampler_2d_cuda - -- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: grid_sampler_2d_backward_cpu - CUDA: grid_sampler_2d_backward_cuda - -- func: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function - dispatch: - CPU: grid_sampler_3d_cpu - CUDA: grid_sampler_3d_cuda - -- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: grid_sampler_3d_backward_cpu - CUDA: grid_sampler_3d_backward_cuda - - func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor variants: function @@ -1329,12 +1270,6 @@ - func: selu_(Tensor self) -> Tensor variants: function -- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor - variants: function - -- func: celu_(Tensor self, Scalar alpha=1.0) -> Tensor - variants: function - - func: sigmoid(Tensor self) -> Tensor - func: sigmoid_(Tensor self) -> Tensor diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 7a7e8be5c7ff6a..0cac9bcb9131fa 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -63,7 +63,7 @@ SparseTensor new_sparse(const SparseType& dtype) { AT_ASSERT(!dtype.is_variable()); AT_ASSERT(dtype.is_sparse()); // TODO: Hmm... this const_cast business seems a bit dodgy - return SparseTensor(new SparseTensorImpl(dtype.backend(), dtype.scalarType()), /* retain */ false); + return SparseTensor(new SparseTensorImpl(const_cast(&dtype)), /* retain */ false); } /*** Helper methods ***/ diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index aac948e4940241..226b9084579031 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -118,7 +118,7 @@ inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { // That's the assumption this code makes. 
return values.type().tensor({nnz}); } else { - std::vector size = values.sizes().vec(); + std::vector size = values.sizes(); size[0] = nnz; return values.type().tensor(size); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index ff4b0e0c57736c..02b190e4901c55 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -81,7 +81,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { int64_t newNnz = newEnd.first - indicesIter; indices1D.resize_({1, newNnz}); - auto newValues_size = values.sizes().vec(); + std::vector newValues_size(values.sizes()); newValues_size[0] = newNnz; Tensor newValues = at::empty(newValues_size, values.options()); diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml index 8a8a8a5dbe954b..86783e4f76dcd6 100644 --- a/aten/src/ATen/nn.yaml +++ b/aten/src/ATen/nn.yaml @@ -58,7 +58,7 @@ # Activation functions -- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) +- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1) cname: ELU has_inplace: True scalar_check: @@ -274,3 +274,11 @@ - name: thnn_conv_dilated3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1) cname: VolumetricDilatedConvolution buffers: [columns, ones] + +# Vision + +- name: thnn_grid_sampler_bilinear2d(Tensor self, Tensor grid, int64_t padding_mode) + cname: SpatialGridSamplerBilinear + +- name: thnn_grid_sampler_bilinear3d(Tensor self, Tensor grid, int64_t padding_mode) + cname: VolumetricGridSamplerBilinear diff --git a/aten/src/ATen/optional.h b/aten/src/ATen/optional.h index 0a395bae67cda6..287ddd8577b340 100644 --- a/aten/src/ATen/optional.h +++ b/aten/src/ATen/optional.h @@ -1 +1,982 @@ -#include +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// From https://github.com/akrzemi1/Optional +// +// ATen: +// - Move to `at` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle it. + +#pragma once + +# include +# include +# include +# include +# include +# include +# include + +# define TR2_OPTIONAL_REQUIRES(...) 
typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +# if defined __GNUC__ // NOTE: GNUC is also defined for Clang +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# endif +# endif +# +# if defined __clang_major__ +# if (__clang_major__ == 3 && __clang_minor__ >= 5) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# elif (__clang_major__ > 3) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# endif +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# endif +# endif +# +# if defined _MSC_VER +# if (_MSC_VER >= 1900) +# define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# endif +# endif + +# if defined __clang__ +# if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif +# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif + + +# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +# define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +# else +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +# define OPTIONAL_CONSTEXPR_INIT_LIST +# endif + +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L) +# define OPTIONAL_HAS_MOVE_ACCESSORS 1 +# else +# define OPTIONAL_HAS_MOVE_ACCESSORS 0 +# endif + +# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr +# if (defined __cplusplus) && (__cplusplus == 201103L) +# define OPTIONAL_MUTABLE_CONSTEXPR +# else +# define OPTIONAL_MUTABLE_CONSTEXPR constexpr +# endif + +namespace at { + +// 20.5.4, optional for object types +template class optional; + +// 20.5.5, optional for lvalue reference types +template class optional; + + +// workaround: std utility functions aren't constexpr yet +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type& t) noexcept +{ + return static_cast(t); +} + +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type&& t) noexcept +{ + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template inline constexpr typename std::remove_reference::type&& constexpr_move(T&& t) noexcept +{ + return static_cast::type&&>(t); +} + + +#if defined NDEBUG +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? 
(EXPR) : ([]{assert(!#CHECK);}(), (EXPR))) +#endif + + +namespace detail_ +{ + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof +{ + template + constexpr static bool has_overload(...) { return false; } + + template ().operator&()) > + constexpr static bool has_overload(bool) { return true; } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) +{ + return &ref; +} + +template )> +T* static_addressof(T& ref) +{ + return std::addressof(ref); +} + + +// the call to convert(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { return v; } + +} // namespace detail + + +constexpr struct trivial_init_t{} trivial_init{}; + + +// 20.5.6, In-place construction +constexpr struct in_place_t{} in_place{}; + + +// 20.5.7, Disengaged state indicator +struct nullopt_t +{ + struct init{}; + constexpr explicit nullopt_t(init){} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { +public: + explicit bad_optional_access(const std::string& what_arg) : logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} +}; + + +template +union storage_t +{ + unsigned char dummy_; + T value_; + + constexpr storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~storage_t(){} +}; + + +template +union constexpr_storage_t +{ + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + + +template +struct optional_base +{ + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + explicit optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { if (init_) storage_.value_.T::~T(); } +}; + + +template +struct constexpr_optional_base +{ + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type> +>::type; + + + +template +class optional : private OptionalBase +{ + static_assert( !std::is_same::type, nullopt_t>::value, "bad T" ); + static_assert( !std::is_same::type, in_place_t>::value, "bad T" ); + + + constexpr bool initialized() const noexcept { return OptionalBase::init_; } + typename std::remove_const::type* dataptr() { return std::addressof(OptionalBase::storage_.value_); } + constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase::storage_.value_); } + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { return OptionalBase::storage_.value_; } +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase::storage_.value_; } +# else + T& contained_val() & { return OptionalBase::storage_.value_; } + T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } +# endif +# else + constexpr const T& contained_val() const { return OptionalBase::storage_.value_; } + T& contained_val() { return OptionalBase::storage_.value_; } +# endif + + void clear() noexcept { + if (initialized()) dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept(noexcept(T(std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept(noexcept(T(il, std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + +public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase() {}; + constexpr optional(nullopt_t) noexcept : OptionalBase() {}; + + optional(const optional& rhs) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept(std::is_nothrow_move_constructible::value) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list il, Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept + { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) + noexcept(std::is_nothrow_move_assignable::value && std::is_nothrow_move_constructible::value) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) + -> typename std::enable_if + < + std::is_same::type, T>::value, + optional& + >::type + { + if (initialized()) { contained_val() = std::forward(v); } + else { initialize(std::forward(v)); } + return *this; + } + + + template + void emplace(Args&&... args) + { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) + { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept(std::is_nothrow_move_constructible::value && noexcept(swap(std::declval(), std::declval()))) + { + if (initialized() == true && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); } + else if (initialized() == false && rhs.initialized() == true) { initialize(std::move(*rhs)); rhs.clear(); } + else if (initialized() == true && rhs.initialized() == true) { using std::swap; swap(**this, *rhs); } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { return initialized(); } + constexpr bool has_value() const noexcept { return initialized(); } + + constexpr T const* operator ->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & { + assert (initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && { + assert (initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +# else + + T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const { + return contained_val(); + } + + T& operator *() { + assert (initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() ? 
contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + +# endif + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# else + + template + T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# endif + +# else + + template + constexpr T value_or(V&& v) const + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# endif + + // 20.6.3.6, modifiers + void reset() noexcept { clear(); } +}; + + +template +class optional +{ + static_assert( !std::is_same::value, "bad T" ); + static_assert( !std::is_same::value, "bad T" ); + T* ref; + +public: + + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + std::is_same::type, optional>::value, + optional& + >::type + { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + !std::is_same::type, optional>::value, + optional& + >::type + = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + + void swap(optional& rhs) noexcept + { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const + { + return *this ? **this : detail_::convert::type>(constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { ref = nullptr; } +}; + + +template +class optional +{ + static_assert( sizeof(T) == 0, "optional rvalue references disallowed" ); +}; + + +// 20.5.8, Relational operators +template constexpr bool operator==(const optional& x, const optional& y) +{ + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template constexpr bool operator!=(const optional& x, const optional& y) +{ + return !(x == y); +} + +template constexpr bool operator<(const optional& x, const optional& y) +{ + return (!y) ? false : (!x) ? true : *x < *y; +} + +template constexpr bool operator>(const optional& x, const optional& y) +{ + return (y < x); +} + +template constexpr bool operator<=(const optional& x, const optional& y) +{ + return !(y < x); +} + +template constexpr bool operator>=(const optional& x, const optional& y) +{ + return !(x < y); +} + + +// 20.5.9, Comparison with nullopt +template constexpr bool operator==(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator==(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + +template constexpr bool operator!=(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator!=(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<(const optional&, nullopt_t) noexcept +{ + return false; +} + +template constexpr bool operator<(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<=(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator<=(nullopt_t, const optional&) noexcept +{ + return true; +} + +template constexpr bool operator>(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator>(nullopt_t, const optional&) noexcept +{ + return false; +} + +template constexpr bool operator>=(const optional&, nullopt_t) noexcept +{ + return true; +} + +template constexpr bool operator>=(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + + + +// 20.5.10, Comparison with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? 
*x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) +{ + x.swap(y); +} + + +template +constexpr optional::type> make_optional(T&& v) +{ + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) +{ + return optional(v.get()); +} + + +} // namespace at + +namespace std +{ + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } + }; + + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } + }; +} + +# undef TR2_OPTIONAL_REQUIRES +# undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp new file mode 100644 index 00000000000000..0491203c3286e6 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.cpp @@ -0,0 +1,69 @@ +#include "ATen/${Storage}.h" + +// ${generated_comment} + +#include "ATen/Half.h" +#include "ATen/Allocator.h" +#include + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Storage}::${Storage}() + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + 0, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator, +#else + getTHDefaultAllocator(), +#endif + /* resizable */ true)) {} + +${Storage}::${Storage}(size_t size) + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + size, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator, +#else + getTHDefaultAllocator(), +#endif + /* resizable */ true)) {} + +${Storage}::${Storage}(size_t size, Allocator* allocator) + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + size, + allocator, + /* resizable */ false)) {} + +// TODO: Take in Device as an input to the std::function constructor + +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + +${Storage}::${Storage}( + void * data, + size_t size, + const std::function & deleter) + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + size, + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), + /* allocator */ nullptr, + /* resizable */ false)) {} +} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h new file mode 100644 index 00000000000000..dddcd5dbf03f21 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.h @@ -0,0 +1,31 @@ +#pragma once + +// ${generated_comment} + +$th_headers + +#include "ATen/Storage.h" +#include "ATen/Context.h" + +#include + +namespace at { + +struct Allocator; + +struct ${Storage} final : public Storage { + ${Storage}(); + ${Storage}(StorageImpl* storage_impl) : Storage(storage_impl){}; + ${Storage}(size_t size); + ${Storage}(size_t size, Allocator* allocator); + ${Storage}( + void* data, + size_t size, + const std::function& deleter); + StorageImpl* storage_impl_; + + protected: + friend struct ${Type}; +}; + +} // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 55fb4aec0cbb60..31e952ebb79ff8 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,6 +2,7 @@ // ${generated_comment} +#include "ATen/Generator.h" #include "ATen/Scalar.h" #include "ATen/ScalarType.h" #include "ATen/SparseTensorRef.h" @@ -9,12 +10,12 @@ #include "ATen/TensorAccessor.h" #include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" +#include "ATen/Utils.h" #include "ATen/Device.h" #include "ATen/Layout.h" #include "ATen/optional.h" namespace at { -struct Generator; struct Type; struct Tensor; struct TensorOptions; diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index aeba9fb22a3653..cc2f47a89180ab 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -3,5 +3,5 @@ std::unique_ptr ${Tensor}::storage() { auto storage = 
THTensor_getStoragePtr(tensor); THStorage_retain(storage); - return std::unique_ptr(new Storage(storage)); + return std::unique_ptr(new ${Storage}(storage)); } diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index 5fab8bf2226417..d72ba4abde2c12 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -5,8 +5,9 @@ // ${generated_comment} +#include "ATen/Config.h" #include "ATen/${Tensor}.h" -#include "ATen/Storage.h" +#include "ATen/${Storage}.h" #include "ATen/Scalar.h" #include "ATen/Half.h" @@ -21,7 +22,7 @@ namespace detail { } ${Tensor}::${Tensor}(${THTensor} * tensor) -: TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, tensor, /* is variable */ false) +: TensorImpl(&globalContext().getType(Backend::${Backend},ScalarType::${ScalarName}), tensor) {} ${TensorDenseOrSparse} diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index ddd1483f0436f3..67009473dddefc 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -31,14 +31,6 @@ namespace at { -#if ${isCUDA} -static int getPointerDevice(void* ptr) { - struct cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); - return attr.device; -} -#endif - ${Type}::${Type}(Context* context) : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { @@ -52,44 +44,18 @@ bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage() const { - return std::unique_ptr(new Storage( - ScalarType::${ScalarName}, - 0, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator -#else - getTHDefaultAllocator() -#endif - )); + return std::unique_ptr(new ${Storage}()); } std::unique_ptr ${Type}::storage(size_t size) const { - return std::unique_ptr(new Storage( - ScalarType::${ScalarName}, - size, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator -#else - getTHDefaultAllocator() -#endif - )); + return std::unique_ptr(new ${Storage}(size)); } std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return std::unique_ptr( - new Storage( - ScalarType::${ScalarName}, - InefficientStdFunctionContext::makeDataPtr(data, deleter, -#if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) -#else - kCPU -#endif - ), - size, - deleter)); + new ${Storage}(data,size,deleter)); } std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { return std::unique_ptr( - new Storage(ScalarType::${ScalarName}, size, allocator)); + new ${Storage}(size, allocator)); } Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) @@ -99,7 +65,7 @@ Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain) ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); - return std::unique_ptr(new Storage((${THStorage}*) th_pointer)); + return std::unique_ptr(new ${Storage}((${THStorage}*) th_pointer)); } std::unique_ptr ${Type}::generator() const { return std::unique_ptr(new ${Generator}(context)); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 8e58df97073086..6b46c8c0b70018 100644 --- a/aten/src/ATen/test/basic.cpp +++ 
b/aten/src/ATen/test/basic.cpp @@ -270,10 +270,6 @@ static void test(Type & type) { auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } - SECTION("core") { - int i = CoreTest(); - REQUIRE(i + 1 == CoreTest()); - } } TEST_CASE( "basic tests CPU", "[cpu]" ) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 4a400e3a517ee6..64098c5bf76c56 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -65,13 +65,30 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze +#ifndef USE_TH_SIZE_ZERO_DIM + if (t.numel() != 0) { + REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t.unsqueeze(0)); + } +#else REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); +#endif // unsqueeze_ { auto t2 = ones(*s, T); +#ifndef USE_TH_SIZE_ZERO_DIM + if (t2.numel() != 0) { + auto r = t2.unsqueeze_(0); + REQUIRE(r.dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t2.unsqueeze_(0)); + } +#else auto r = t2.unsqueeze_(0); REQUIRE(r.dim() == t.dim() + 1); +#endif } // squeeze (with dimension argument) diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp index 840c97617c4cb2..1c46c59a9977fa 100644 --- a/aten/src/TH/THHalf.cpp +++ b/aten/src/TH/THHalf.cpp @@ -1,5 +1,4 @@ #include "THHalf.h" -#include /* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ @@ -17,14 +16,85 @@ TH_API float TH_half2float(THHalf h) return f; } +// Host functions for converting between FP32 and FP16 formats void TH_halfbits2float(unsigned short* src, float* res) { - *res = at::detail::halfbits2float(*src); -} + unsigned h = *src; + unsigned sign = ((h >> 15) & 1); + unsigned exponent = ((h >> 10) & 0x1f); + unsigned mantissa = ((h & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); +} void TH_float2halfbits(float* src, unsigned short* dest) { - *dest = at::detail::float2halfbits(*src); + unsigned x = *(unsigned*)src; + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + *dest = 0x7fffU; + return ; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + *dest = sign | 0x7c00U; + return; + } + if (u < 0x33000001) { + *dest = (sign | 0x0000); + return; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. 
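// [editor's note] Context for the rounding step below: `lsb` is the value of the
// lowest mantissa bit kept after the shift, `lsb_s1` is half of that unit, and
// `lsb_m1` masks the bits that get dropped. "Round to nearest even" then reads:
// round up when the dropped bits exceed half a unit, and on an exact tie round up
// only if the kept mantissa is odd, so ties always land on an even mantissa. The
// increment can carry out of the 10-bit mantissa, which is why the exponent is
// bumped and the mantissa reset when (mantissa & 0x3ff) becomes zero.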
+ remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + *dest = (sign | (exponent << 10) | mantissa); } diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index 0c36d5bf97fcf0..0f05bb466651d3 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -19,25 +19,38 @@ void THStorage_free(THStorage* storage) { if (!storage) { return; } - storage->release(); + + if (--storage->refcount == 0) { + if (storage->finalizer) { + (*storage->finalizer)(); + } + storage->finalizer = nullptr; + storage->data_ptr.clear(); + THStorage_weakFree(storage); + } } // Manually retains a weak reference void THStorage_weakRetain(THStorage *weak_storage) { - weak_storage->weak_retain(); + weak_storage->weakcount++; } // Releases a weak reference void THStorage_weakFree(THStorage *weak_storage) { - weak_storage->weak_release(); + if (--weak_storage->weakcount == 0) { + delete weak_storage; + } } // Given a weak reference, returns a strong reference to a storage (which must // be freed when done) or null if the storage is already dead. THStorage* THStorage_weakLock(THStorage *weak_storage) { - if (weak_storage->weak_lock()) - return weak_storage; - return nullptr; + for (;;) { + int refcount = weak_storage->refcount.load(); + if (refcount == 0) return nullptr; + if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; + } + return weak_storage; } THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { @@ -82,7 +95,7 @@ ptrdiff_t THStorage_size(const THStorage *self) void THStorage_retain(THStorage *storage) { if (storage) { - storage->retain(); + ++storage->refcount; } } diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 0e8b3e4ab17bee..671e2f39fb1c7e 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -35,6 +35,8 @@ TH_API ptrdiff_t THStorage_size(const THStorage *self); +TH_API void THStorage_setFlag(THStorage *storage, const char flag); +TH_API void THStorage_clearFlag(THStorage *storage, const char flag); TH_API void THStorage_retain(THStorage *storage); TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 5f3b6ed1fef6cc..13df5128e5f5f8 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -32,7 +32,7 @@ THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList // This could perhaps be combined with the below code, but the complexity didn't seem worth it. 
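// [editor's note] The THStorage_weakLock change above re-implements "lock a weak
// reference": bump the strong refcount with a compare-and-swap loop, but only while
// it is still non-zero. A standalone sketch of that pattern with std::atomic; the
// helper name is ours, not part of TH.
#include <atomic>

static bool try_strong_retain(std::atomic<int>& refcount) {
  int current = refcount.load();
  while (current != 0) {
    // On failure compare_exchange_weak reloads `current`, so the loop just retries.
    if (refcount.compare_exchange_weak(current, current + 1)) {
      return true;   // caller now owns a strong reference
    }
  }
  return false;      // storage already dead; the weak lock fails
}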
int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); if (numel == 0 && oldshape.equals(newshape)) { - return oldstride.vec(); + return std::vector(oldstride); } std::vector newstride(newshape.size()); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 56204a00e9c3ed..16329f7ed7f621 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -56,10 +56,6 @@ struct THTensor return sizes_.size(); } - at::ScalarType scalar_type() const { - return storage_->scalar_type; - } - ptrdiff_t storage_offset() const { return storage_offset_; } @@ -113,17 +109,6 @@ inline int64_t* THTensor_getStridePtr(THTensor* tensor) { // NB: Non-retaining inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { - // Within PyTorch, the invariant is that storage_ is always - // initialized; we never have tensors that don't have any storage. - // However, for Caffe2, this is not true, because they have permitted - // tensors to be allocated without specifying what scalar type - // they should be, only to be filled when GetMutableData is called - // for the first time (providing the necessary type). It is an ERROR to - // invoke any PyTorch operations on such a half-constructed storage, - // and this check tests for that case. - AT_CHECK(tensor->storage_, "Cannot use PyTorch operations on a half-constructed " - "tensor. If this tensor came from Caffe2, please call GetMutableData on " - "it first; otherwise, this is a bug, please report it."); return tensor->storage_; } @@ -133,7 +118,6 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { inline void THTensor_resizeDim(THTensor* tensor, int64_t ndim) { // NB: This is *truly* a resize; calling code (e.g., squeeze) // assumes that old values are preserved - tensor->is_zero_dim_ = bool(ndim == 0); tensor->sizes_.resize(ndim); tensor->strides_.resize(ndim); } @@ -157,9 +141,6 @@ inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset // NB: Steals ownership of storage inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { - // Caffe2 might have tensors whose storages are null, but we - // don't allow it in PyTorch. - AT_ASSERT(storage); tensor->storage_ = storage; } @@ -196,19 +177,6 @@ inline int THTensor_nDimensionLegacyAll(const THTensor* tensor) { } } -inline int64_t THTensor_strideLegacyNoScalars(const THTensor *self, int dim) { - THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", - dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); - return THTensor_isZeroDim(self) ? 1 : self->stride(dim); -} - -inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim) -{ - THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", - dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); - return THTensor_isZeroDim(self) ? 
1 : self->size(dim); -} - TH_API void THTensor_free(THTensor *self); TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index ff05ed8194979d..00c24dee51adb8 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ -39,8 +39,8 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyNoScalars(TENSOR1)); \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ int same_dims = 1; \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ same_dims = 0; \ @@ -56,8 +56,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -76,14 +76,14 @@ { \ CODE \ \ - if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -98,7 +98,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -145,13 +145,13 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyAll(TENSOR1)); \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \ } \ TH_UNUSED int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ @@ -163,8 +163,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = 
(int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -179,14 +179,14 @@ { \ CODE \ \ - if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -200,7 +200,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index e68c60a9455c4f..58a5d39366c294 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -373,7 +373,11 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck( firstIndex >= 0, 3, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 4, "out of range"); +#else + THArgCheck( size > 0, 4, "out of range"); +#endif THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range"); THTensor_(set)(self, src); @@ -392,8 +396,12 @@ void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sli if(!src) src = self; +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a vector"); +#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range"); @@ -415,8 +423,8 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THTensor_(set)(self, src); @@ -438,7 +446,10 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz if(!src) src = self; - THArgCheck((dimension >= 0) && (dimension < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); @@ -448,20 +459,18 
@@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz std::vector newStride(/* size */ self->dim()+1); newSize[self->dim()] = size; - newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { - auto self_size = THTensor_sizeLegacyNoScalars(self, d); - auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self_size - size) / step + 1; - newStride[d] = step*self_stride; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self_size; - newStride[d] = self_stride; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } @@ -538,6 +547,9 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 2, "cannot unsqueeze empty tensor"); +#endif THTensor_(set)(self, src); @@ -716,6 +728,15 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t for(d = 0; d < nDimension; d++) { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. + if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -769,14 +790,14 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 03946724dcadc6..644fa541a8f9ae 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -149,8 +149,15 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens int64_t *index_data; real *tensor_data, *src_data; - THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be 1-dimensional"); - THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(index) <= 1, 3, "Index is supposed to be an empty tensor or a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyAll(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + THArgCheck(THTensor_nDimensionLegacyAll(src) > 
0, 2, "Source tensor is empty"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + //THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); +#endif numel = THLongTensor_nElement(index); @@ -181,7 +188,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->dim() <= 1) { + if (src->dim() == 1) { #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; idim() <= 1) + else if (src->dim() == 1) { for (i=0; idim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + THArgCheck(numel == src->size(dim),4,"Number of indices should be equal to source:size(dim)"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -388,8 +400,13 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v int64_t *index_data; numel = THLongTensor_nElement(index); - THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(index) == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyAll(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -442,11 +459,19 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor { int64_t elems_per_row, i, idx; +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(dim < THTensor_(nDimensionLegacyAll)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimensionLegacyAll(index) == THTensor_(nDimensionLegacyAll)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimensionLegacyAll)(src) == THTensor_(nDimensionLegacyAll)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#else THArgCheck(dim < THTensor_(nDimensionLegacyNoScalars)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimensionLegacyNoScalars(index) == THTensor_(nDimensionLegacyNoScalars)(tensor), 3, "Index tensor must have same dimensions as output tensor"); THArgCheck(THTensor_(nDimensionLegacyNoScalars)(src) == THTensor_(nDimensionLegacyNoScalars)(tensor), 4, "Input tensor must have same dimensions as output tensor"); +#endif elems_per_row = THLongTensor_size(index, dim); diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 24d9a7e8c4ea07..c521d1da750a43 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -805,11 +805,11 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, T void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) { - if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) + if( (mat->dim() != 2) || (vec->dim() != 1) ) THError("matrix and vector expected, got 
%dD, %dD", - mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); + mat->dim(), vec->dim()); - if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) { + if( mat->size(1) != vec->size(0) ) { THDescBuff bm = THTensor_(sizeDesc)(mat); THDescBuff bv = THTensor_(sizeDesc)(vec); THError("size mismatch, %s, %s", bm.str, bv.str); @@ -837,14 +837,14 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor { THBlas_(gemv)('n', mat->size(0), mat->size(1), alpha, THTensor_(data)(mat), mat->stride(1), - THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), + THTensor_(data)(vec), vec->stride(0), beta, THTensor_(data)(r_), r_->stride(0)); } else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0))) { THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(mat), mat->stride(0), - THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), + THTensor_(data)(vec), vec->stride(0), beta, THTensor_(data)(r_), r_->stride(0)); } else @@ -853,7 +853,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(cmat), cmat->stride(0), - THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), + THTensor_(data)(vec), vec->stride(0), beta, THTensor_(data)(r_), r_->stride(0)); THTensor_(free)(cmat); @@ -861,7 +861,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // In gemv (x,0).mv(0) does not // handle beta, whereas gemm does for case where (x,0).mm(0,y). - if (THTensor_sizeLegacyNoScalars(vec, 0) == 0 && mat->size(0) != 0) { + if (vec->size(0) == 0 && mat->size(0) != 0) { if (beta == 0) { THTensor_(zero)(r_); } else if (beta != 1) { @@ -1058,19 +1058,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) { - if( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) + if( (vec1->dim() != 1) || (vec2->dim() != 1) ) THError("vector and vector expected, got %dD, %dD tensors", - THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); + vec1->dim(), vec2->dim()); if(t->dim() != 2) THError("expected matrix, got %dD tensor for t", t->dim()); - auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); - auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); - auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); - auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); - - if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { + if( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bv1 = THTensor_(sizeDesc)(vec1); THDescBuff bv2 = THTensor_(sizeDesc)(vec2); @@ -1092,27 +1087,27 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1))) + if(r_->stride(0) == 1 && LDA_COND(vec1->size(0), vec2->size(0), r_->stride(1))) { - THBlas_(ger)(vec1_size, vec2_size, - alpha, THTensor_(data)(vec1), vec1_stride, - THTensor_(data)(vec2), vec2_stride, + THBlas_(ger)(vec1->size(0), vec2->size(0), + alpha, THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(vec2), vec2->stride(0), THTensor_(data)(r_), r_->stride(1)); } - else 
if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1_size, r_->stride(0))) + else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1->size(0), r_->stride(0))) { - THBlas_(ger)(vec2_size, vec1_size, - alpha, THTensor_(data)(vec2), vec2_stride, - THTensor_(data)(vec1), vec1_stride, + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cr = THTensor_(newClone)(r_); - THBlas_(ger)(vec2_size, vec1_size, - alpha, THTensor_(data)(vec2), vec2_stride, - THTensor_(data)(vec1), vec1_stride, + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), THTensor_(data)(cr), cr->stride(0)); THTensor_(freeCopyTo)(cr, r_); diff --git a/aten/src/TH/generic/THTensorMoreMath.cpp b/aten/src/TH/generic/THTensorMoreMath.cpp index fa8fb0558661ea..d06ec255644cce 100644 --- a/aten/src/TH/generic/THTensorMoreMath.cpp +++ b/aten/src/TH/generic/THTensorMoreMath.cpp @@ -557,6 +557,9 @@ void THTensor_(onesLike)(THTensor *r_, THTensor *input) void THTensor_(diag)(THTensor *r_, THTensor *t, int k) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!t->is_empty()) +#endif THArgCheck(THTensor_(nDimensionLegacyNoScalars)(t) == 1 || THTensor_(nDimensionLegacyNoScalars)(t) == 2, 1, "matrix or a vector expected"); if(THTensor_(nDimensionLegacyNoScalars)(t) == 1) @@ -1183,11 +1186,19 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { +#ifndef USE_TH_SIZE_ZERO_DIM + int numDims = THTensor_(nDimensionLegacyAll)(t); +#else int numDims = THTensor_(nDimensionLegacyNoScalars)(t); +#endif THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); int64_t sliceSize = THTensor_(size)(t, dim); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); +#else THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); +#endif THTensor *tmpResults = THTensor_(new)(); THTensor_(resize1d)(tmpResults, sliceSize); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index a8fb33c11a5bd4..9df36f097ba6ee 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -10,7 +10,7 @@ #include "THCTensorInfo.cuh" int THCTensor_nDimensionLegacyNoScalars(THCState *state, const THCTensor *self) { - return THTensor_nDimensionLegacyNoScalars(self); + return self->dim(); } int THCTensor_nDimensionLegacyAll(THCState *state, const THCTensor *self) { @@ -99,6 +99,15 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ for(d = 0; d < nDimension; d++) { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. 
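// Editor's note: illustrative sketch (not the actual THTensor/THCTensor resizeNd
// code) of the size handling that the #ifndef USE_TH_SIZE_ZERO_DIM guard restores:
// a zero in the leading dimension collapses to the legacy 1-d "size [0]" tensor,
// and every other dimension must be strictly positive. The function name is
// hypothetical and the error message mirrors the AT_CHECK text in the hunk.
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<int64_t> legacy_clamp_sizes(std::vector<int64_t> sizes) {
  if (!sizes.empty() && sizes[0] == 0) {
    return {0};  // legacy empty tensor: exactly one dimension of size 0
  }
  for (int64_t s : sizes) {
    if (s <= 0) throw std::runtime_error("sizes must be non-negative");
  }
  return sizes;
}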
+ if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -225,6 +234,9 @@ void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 3, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 3, "cannot unsqueeze empty tensor"); +#endif THCTensor_set(state, self, src); diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 940af6eb86ead4..e15ba5e5a2c666 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -28,21 +28,11 @@ int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) return THCTensor_size(state, self, dim); } -int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) -{ - return THTensor_sizeLegacyNoScalars(self, dim); -} - int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) { return THCTensor_stride(state, self, dim); } -int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) -{ - return THTensor_strideLegacyNoScalars(self, dim); -} - THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) { return THCTensor_newSizeOf(state, self); @@ -377,7 +367,11 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di THArgCheck( (dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck( firstIndex >= 0, 4, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 5, "out of range"); +#else + THArgCheck( size > 0, 5, "out of range"); +#endif THArgCheck(firstIndex+size <= src->size(dimension), 5, "out of range"); THCTensor_(set)(state, self, src); @@ -396,8 +390,12 @@ void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a vector"); +#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 4, "out of range"); @@ -419,8 +417,8 @@ void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THCTensor_(set)(state, self, src); @@ -442,8 +440,11 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; - THArgCheck(dimension < THTensor_nDimensionLegacyNoScalars(src), 2, "out of range"); - THArgCheck(size <= THTensor_sizeLegacyNoScalars(src, dimension), 3, "out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck(dimension < src->dim(), 2, "out of range"); + THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THCTensor_(set)(state, self, src); @@ 
-452,20 +453,18 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di std::vector newStride(self->dim() + 1); newSize[self->dim()] = size; - newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { - auto self_size = THTensor_sizeLegacyNoScalars(self, d); - auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self_size - size) / step + 1; - newStride[d] = step*self_stride; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self_size; - newStride[d] = self_stride; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } @@ -604,15 +603,15 @@ void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int6 void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h index 2ee1bf11a4be4c..dbb1591ae194f2 100644 --- a/aten/src/THC/generic/THCTensor.h +++ b/aten/src/THC/generic/THCTensor.h @@ -26,9 +26,7 @@ THC_API int THCTensor_(nDimensionLegacyNoScalars)(THCState *state, const THCTens THC_API int THCTensor_(nDimensionLegacyAll)(THCState *state, const THCTensor *self); THC_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim); -THC_API int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); -THC_API int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self); THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self); THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 82f56f9946e471..4cbf5dd224abe5 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -537,6 +537,16 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in THLongStorage *newSize; +#ifndef USE_TH_SIZE_ZERO_DIM + if (numIndices == 0) { + newSize = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(newSize, 0, numIndices); + THCTensor_(resize)(state, dst, newSize, NULL); + THLongStorage_free(newSize); + return; + } +#endif + newSize = 
THCTensor_(newSizeOf)(state, src); THLongStorage_set(newSize, dim, numIndices); THCTensor_(resize)(state, dst, newSize, NULL); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index cc1a8c9ba57e41..642b14aec48cfd 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -330,6 +330,9 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k){ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); int nDimension = THCTensor_(nDimensionLegacyNoScalars)(state, src_); +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!src_->is_empty()); +#endif THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); if (nDimension == 2) { int64_t stride0 = THCTensor_(stride)(state, src_, 0); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 591780b04edf75..17ef020e85f8ee 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -49,15 +49,11 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); - if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) + if( (mat->dim() != 2) || (vec->dim() != 1) ) THError("2D tensor and 1D tensor expected, got %dD, %dD tensors", - mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); + mat->dim(), vec->dim()); - - auto vec_size = THTensor_sizeLegacyNoScalars(vec, 0); - auto vec_stride = THTensor_strideLegacyNoScalars(vec, 0); - - if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) + if( mat->size(1) != vec->size(0) ) THError("size mismatch"); if(t->dim() != 1) @@ -78,12 +74,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 'n', mat->size(0), mat->size(1), alpha, THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 'n', mat->size(0), mat->size(1), alpha, THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -92,12 +88,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -108,12 +104,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec_stride, 
+ THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif @@ -133,7 +129,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #elif defined(THC_REAL_IS_HALF) // Currently no Hgemv/SgemvEx in Cublas THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); - THCTensor_(resize2d)(state, vecAsMatrix, vec_size, 1); + THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size(0), 1); THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size(0), 1); @@ -155,20 +151,16 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); - if ( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) { + if ( (vec1->dim() != 1) || (vec2->dim() != 1) ) { THError("1D tensors expected, got %dD, %dD tensors", - THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); + vec1->dim(), vec2->dim()); } - auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); - auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); - auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); - auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); if (t->dim() != 2) { THError("size mismatch"); } - if ( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { + if ( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THError("size mismatch"); } @@ -187,28 +179,28 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a if(r_->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec1_size, vec2_size, - alpha, THCTensor_(data)(state, vec1), vec1_stride, - THCTensor_(data)(state, vec2), vec2_stride, + THCudaBlas_Sger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), THCTensor_(data)(state, r_), r_->stride(1)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec1->size(0), vec2_size, - alpha, THCTensor_(data)(state, vec1), vec1_stride, - THCTensor_(data)(state, vec2), vec2_stride, + THCudaBlas_Dger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), THCTensor_(data)(state, r_), r_->stride(1)); #endif } else if(r_->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + 
THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -217,14 +209,14 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor *cr = THCTensor_(newClone)(state, r_); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, cr), cr->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, cr), cr->stride(0)); #endif @@ -233,11 +225,11 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a #elif defined(THC_REAL_IS_HALF) // currently no Hger/SgerEx in Cublas. THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); - THCTensor_(resize2d)(state, vec2T, vec2_size, 1); + THCTensor_(resize2d)(state, vec2T, vec2T->size(0), 1); THCTensor_(transpose)(state, vec2T, NULL, 0, 1); THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); - THCTensor_(resize2d)(state, vec1M, vec1_size, 1); + THCTensor_(resize2d)(state, vec1M, vec1M->size(0), 1); THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); THCTensor_(free)(state, vec2T); diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 78faef7a7f227b..79b11c2db9b64f 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -43,6 +43,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu @@ -70,6 +71,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index 9c4c2ea1fdc8b6..d17d185b4858bf 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -8,17 +8,15 @@ struct ELUupdateOutput_functor { const T negcoef_; const T poscoef_; - const T negiptcoef_; - ELUupdateOutput_functor(T negcoef, T poscoef, T negiptcoef) + ELUupdateOutput_functor(T negcoef, T poscoef) : negcoef_(negcoef) , poscoef_(poscoef) - , negiptcoef_(negiptcoef) {} __device__ void operator()(T *output, const T *input) const { - *output = *input <= 0 ? (exp(*input * negiptcoef_) - 1) * negcoef_ : *input * poscoef_; + *output = *input <= 0 ? 
(exp(*input) - 1) * negcoef_ : *input * poscoef_; } }; @@ -28,17 +26,15 @@ struct ELUupdateOutputIP_functor { const T negcoef_; const T poscoef_; - const T negiptcoef_; - ELUupdateOutputIP_functor(T negcoef, T poscoef, T negiptcoef) + ELUupdateOutputIP_functor(T negcoef, T poscoef) : negcoef_(negcoef) , poscoef_(poscoef) - , negiptcoef_(negiptcoef) {} __device__ void operator()(T *x) const { - *x = *x <= 0 ? (exp(*x * negiptcoef_) - 1) * negcoef_ : *x * poscoef_; + *x = *x <= 0 ? (exp(*x) - 1) * negcoef_ : *x * poscoef_; } }; @@ -47,17 +43,15 @@ struct ELUupdateGradInput_functor { const T negcoef_; const T poscoef_; - const T negiptcoef_; - ELUupdateGradInput_functor(T negcoef, T poscoef, T negiptcoef) + ELUupdateGradInput_functor(T negcoef, T poscoef) : negcoef_(negcoef) , poscoef_(poscoef) - , negiptcoef_(negiptcoef) {} __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - *gradInput = (*output) <= 0 ? (*gradOutput * negiptcoef_ * (*output + negcoef_)) : (*gradOutput * poscoef_); + *gradInput = (*output) <= 0 ? (*gradOutput * (*output + negcoef_)) : (*gradOutput * poscoef_); } }; diff --git a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu new file mode 100644 index 00000000000000..30a1a5d5ade10b --- /dev/null +++ b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu @@ -0,0 +1,243 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < W && y >= 0 && y < H) +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, H, W)) { \ + atomicAdd(&input[n][c][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + int c; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw, iy_nw, IH, IW)) { + out_val += input[n][c][iy_nw][ix_nw] * nw; + } + if (WITHIN_BOUNDS(ix_ne, iy_ne, IH, IW)) { + out_val += input[n][c][iy_ne][ix_ne] * ne; + } + if (WITHIN_BOUNDS(ix_sw, iy_sw, IH, IW)) { + out_val += input[n][c][iy_sw][ix_sw] * sw; + } + if (WITHIN_BOUNDS(ix_se, iy_se, IH, IW)) { + out_val += input[n][c][iy_se][ix_se] * se; + } + output[n][c][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1));; + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy));; + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; 
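// Editor's note: host-side sketch of the coordinate math these bilinear sampler
// kernels apply per output element (illustrative only; the real code runs as a
// CUDA kernel over device tensor views, and the struct/function names here are
// hypothetical). Grid values in [-1, 1] are mapped to pixel space, and the four
// corner weights are the usual bilinear surface areas used above.
#include <cmath>

struct BilinearTaps { int x0, y0; float w_nw, w_ne, w_sw, w_se; };

inline BilinearTaps bilinear_taps(float gx, float gy, int IH, int IW) {
  float ix = (gx + 1.f) / 2.f * (IW - 1);      // [-1,1] -> [0, IW-1]
  float iy = (gy + 1.f) / 2.f * (IH - 1);      // [-1,1] -> [0, IH-1]
  int x0 = static_cast<int>(std::floor(ix));   // north-west corner
  int y0 = static_cast<int>(std::floor(iy));
  float tx = ix - x0, ty = iy - y0;            // fractional offsets
  return {x0, y0,
          (1 - tx) * (1 - ty),   // nw weight, i.e. (ix_se - ix) * (iy_se - iy)
          tx * (1 - ty),         // ne
          (1 - tx) * ty,         // sw
          tx * ty};              // se
}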
+ int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + Dtype gradout; + Dtype nw_val; + Dtype ne_val; + Dtype sw_val; + Dtype se_val; + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + nw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw_cl, iy_nw_cl, IH, IW)) { + nw_val = input[n][c][iy_nw_cl][ix_nw_cl]; + } + ne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_ne_cl, iy_ne_cl, IH, IW)) { + ne_val = input[n][c][iy_ne_cl][ix_ne_cl]; + } + sw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_sw_cl, iy_sw_cl, IH, IW)) { + sw_val = input[n][c][iy_sw_cl][ix_sw_cl]; + } + se_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_se_cl, iy_se_cl, IH, IW)) { + se_val = input[n][c][iy_se_cl][ix_se_cl]; + } + + gix += ScalarConvert::to(-1)*(nw_val * (iy_se - iy) * gradout); + gix += ne_val * (iy_sw - iy) * gradout; + gix += ScalarConvert::to(-1)*(sw_val * (iy - iy_ne) * gradout); + gix += se_val * (iy - iy_nw) * gradout; + + giy += ScalarConvert::to(-1)*(nw_val * (ix_se - ix) * gradout); + giy += ScalarConvert::to(-1)*(ne_val * (ix - ix_sw) * gradout); + giy += sw_val * (ix_ne - ix) * gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + Dtype gix_old = gradGrid[n][h][w][0]; + Dtype giy_old = gradGrid[n][h][w][1]; + + gradGrid[n][h][w][0] = gix_old + gix; + gradGrid[n][h][w][1] = giy_old + giy; + } +} + +#undef MIN +#undef MAX +#undef CLIP_COORDINATES +#undef WITHIN_BOUNDS +#undef SAFE_ADD + +#include "generic/SpatialGridSamplerBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu new file mode 100644 index 00000000000000..43b8ceff1cb8ae --- /dev/null +++ b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,421 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, z, D, H, W) (x >= 0 && x < W && y >= 0 && y < H && z >= 0 && z < D) +#define SAFE_ADD(input, x, y, z, n, c, D, H, 
W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, z, D, H, W)) { \ + atomicAdd(&input[n][c][z][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + int c; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, 
IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw, iy_tnw, iz_tnw, ID, IH, IW)) { + out_val += input[n][c][iz_tnw][iy_tnw][ix_tnw] * tnw; + } + if (WITHIN_BOUNDS(ix_tne, iy_tne, iz_tne, ID, IH, IW)) { + out_val += input[n][c][iz_tne][iy_tne][ix_tne] * tne; + } + if (WITHIN_BOUNDS(ix_tsw, iy_tsw, iz_tsw, ID, IH, IW)) { + out_val += input[n][c][iz_tsw][iy_tsw][ix_tsw] * tsw; + } + if (WITHIN_BOUNDS(ix_tse, iy_tse, iz_tse, ID, IH, IW)) { + out_val += input[n][c][iz_tse][iy_tse][ix_tse] * tse; + } + if (WITHIN_BOUNDS(ix_bnw, iy_bnw, iz_bnw, ID, IH, IW)) { + out_val += input[n][c][iz_bnw][iy_bnw][ix_bnw] * bnw; + } + if (WITHIN_BOUNDS(ix_bne, iy_bne, iz_bne, ID, IH, IW)) { + out_val += input[n][c][iz_bne][iy_bne][ix_bne] * bne; + } + if (WITHIN_BOUNDS(ix_bsw, iy_bsw, iz_bsw, ID, IH, IW)) { + out_val += input[n][c][iz_bsw][iy_bsw][ix_bsw] * bsw; + } + if (WITHIN_BOUNDS(ix_bse, iy_bse, iz_bse, ID, IH, IW)) { + out_val += input[n][c][iz_bse][iy_bse][ix_bse] * bse; + } + output[n][c][d][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + Dtype giz = ScalarConvert::to(0); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw 
= (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + Dtype gradout; + Dtype tnw_val; + Dtype tne_val; + Dtype tsw_val; + Dtype tse_val; + Dtype bnw_val; + Dtype bne_val; + Dtype bsw_val; + Dtype bse_val; + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][d][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + tnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ID, IH, IW)) { + tnw_val = input[n][c][iz_tnw_cl][iy_tnw_cl][ix_tnw_cl]; + } + tne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tne_cl, iy_tne_cl, iz_tne_cl, ID, 
IH, IW)) { + tne_val = input[n][c][iz_tne_cl][iy_tne_cl][ix_tne_cl]; + } + tsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ID, IH, IW)) { + tsw_val = input[n][c][iz_tsw_cl][iy_tsw_cl][ix_tsw_cl]; + } + tse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tse_cl, iy_tse_cl, iz_tse_cl, ID, IH, IW)) { + tse_val = input[n][c][iz_tse_cl][iy_tse_cl][ix_tse_cl]; + } + bnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ID, IH, IW)) { + bnw_val = input[n][c][iz_bnw_cl][iy_bnw_cl][ix_bnw_cl]; + } + bne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bne_cl, iy_bne_cl, iz_bne_cl, ID, IH, IW)) { + bne_val = input[n][c][iz_bne_cl][iy_bne_cl][ix_bne_cl]; + } + bsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ID, IH, IW)) { + bsw_val = input[n][c][iz_bsw_cl][iy_bsw_cl][ix_bsw_cl]; + } + bse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bse_cl, iy_bse_cl, iz_bse_cl, ID, IH, IW)) { + bse_val = input[n][c][iz_bse_cl][iy_bse_cl][ix_bse_cl]; + } + + Dtype m1 = ScalarConvert::to(-1); + gix += m1 * tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix += m1 * tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix += m1 * bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix += m1 * bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy += m1 * tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy += m1 * tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy += m1 * bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy += m1 * bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz += m1 * tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz += m1 * tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz += m1 * tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz += m1 * tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + giz = giz * (ID - 1) / 2; + + Dtype gix_old = gradGrid[n][d][h][w][0]; + Dtype giy_old = gradGrid[n][d][h][w][1]; + Dtype giz_old = gradGrid[n][d][h][w][2]; + + gradGrid[n][d][h][w][0] = gix_old + gix; + gradGrid[n][d][h][w][1] = giy_old + giy; + gradGrid[n][d][h][w][2] = giz_old + giz; + } +} + +#undef MIN +#undef MAX +#undef CLIP_COORDINATES +#undef WITHIN_BOUNDS +#undef SAFE_ADD + +#include "generic/VolumetricGridSamplerBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h index e2a99640ba69b6..47f9bee0fb6744 100644 --- a/aten/src/THCUNN/common.h +++ b/aten/src/THCUNN/common.h @@ -62,7 +62,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ 
- THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ + THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -70,7 +70,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCIndexTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCIndexTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ + THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index 81eabc68812f36..03dd38a7bd76ee 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -21,11 +21,11 @@ static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { - size[i] = THTensor_sizeLegacyNoScalars(t, i); + size[i] = t->size(i); } else if (i < Dim) { size[i] = 1; } else { - size[Dim - 1] *= THTensor_sizeLegacyNoScalars(t, i); + size[Dim - 1] *= t->size(i); } } return THCDeviceTensor(t->data(), size); diff --git a/aten/src/THCUNN/generic/ClassNLLCriterion.cu b/aten/src/THCUNN/generic/ClassNLLCriterion.cu index 6866c5798f7d23..6126dee76dcb27 100644 --- a/aten/src/THCUNN/generic/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/ClassNLLCriterion.cu @@ -16,7 +16,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } int n_dims = THCTensor_(nDimensionLegacyNoScalars)(state, input); - int n_classes = THCTensor_(sizeLegacyNoScalars)(state, input, n_dims - 1); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); ignore_index -= TH_INDEX_BASE; if (weights) { @@ -31,8 +31,8 @@ void THNN_(ClassNLLCriterion_updateOutput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); - int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(sizeLegacyNoScalars)(state, input, 0); - int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); + int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(size)(state, input, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); @@ -152,7 +152,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); int64_t batch_size = n_dims == 1 ? 
1 : THCTensor_(size)(state, input, 0); - int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); diff --git a/aten/src/THCUNN/generic/ELU.cu b/aten/src/THCUNN/generic/ELU.cu index 6f78349110ec35..5c09a0607f0246 100644 --- a/aten/src/THCUNN/generic/ELU.cu +++ b/aten/src/THCUNN/generic/ELU.cu @@ -11,23 +11,21 @@ void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, - accreal input_scale, bool inplace) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale * input_scale); - real negiptcoef = ScalarConvert::to(input_scale); + real poscoef = ScalarConvert::to(scale); THCUNN_assertSameGPU(state, 2, input, output); if (inplace) { - THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef, negiptcoef)); + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef)); THCTensor_(set)(state, output, input); } else { THCTensor_(resizeAs)(state, output, input); - THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef, negiptcoef)); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef)); } } @@ -38,17 +36,15 @@ void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale, - accreal input_scale) + accreal scale) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale * input_scale); - real negiptcoef = ScalarConvert::to(input_scale); + real poscoef = ScalarConvert::to(scale); THCUNN_check_nElement(state, output, gradOutput); THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); THCTensor_(resizeAs)(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef, negiptcoef)); + THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef)); } #endif diff --git a/aten/src/THCUNN/generic/GatedLinearUnit.cu b/aten/src/THCUNN/generic/GatedLinearUnit.cu index 9bd59eec538cb6..4622403e76088f 100644 --- a/aten/src/THCUNN/generic/GatedLinearUnit.cu +++ b/aten/src/THCUNN/generic/GatedLinearUnit.cu @@ -12,7 +12,7 @@ void THNN_(GatedLinear_updateOutput)( // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THCTensor_(sizeLegacyNoScalars)(state, input, dim); + const int64_t nIn = THCTensor_(size)(state, input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 65bd6cdec850bb..8272b3d4020ec7 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -18,7 +18,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (THTensor_nDimensionLegacyNoScalars(input) == 1) + if (input->dim() == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(input, 0), + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(input, 0), + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -52,7 +52,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -149,7 +149,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (THTensor_nDimensionLegacyNoScalars(input) == 1) + if (input->dim() == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(gradInput, 0), + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(gradInput, 0), + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -187,7 +187,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu index 2a0d719ff6a3e6..2517b409409aed 100644 --- a/aten/src/THCUNN/generic/PReLU.cu +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -24,8 +24,8 @@ void THNN_(PReLU_updateOutput)( input = THCTensor_(newContiguous)(state, input); int n = THCTensor_(nElement)(state, input); - if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { @@ -69,8 +69,8 @@ void THNN_(PReLU_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); int n = THCTensor_(nElement)(state, input); - if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index 0363dcf0e3996a..f73bd5835c04bb 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && t->dim() == 2 && t->size(1) == 3; + return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; + return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 1 && t->size(0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu index ae211774a580db..b7010977558816 100644 --- a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -8,10 +8,10 @@ void THNN_(SpatialClassNLLCriterion_shapeCheck)( THCIndexTensor *target, THCTensor *weights) { - AT_CHECK(!target->is_empty() && target->dim() == 3, 1, + AT_CHECK(!target->is_empty() && THCIndexTensor_(nDimensionLegacyNoScalars)(state, target) == 3, 1, "only batches of spatial targets supported (non-empty 3D tensors)" \ " but got targets of size: : ", target->sizes()); - AT_CHECK(!input->is_empty() && input->dim() == 4, 2, + AT_CHECK(!input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, "only batches of spatial inputs supported (non-empty 4D tensors), " \ "but got input of size: ", input->sizes()); if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 7860404b685f52..334afe93cb727e 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -73,7 +73,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 546ec2ae3c6185..7c6716c41f5bff 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -31,7 +31,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // Bias has same # of channels as output if (bias) { - THAssert(THTensor_sizeLegacyNoScalars(bias, 0) == weight->size(0)); + THAssert(bias->size(0) == weight->size(0)); } input = THCTensor_(newContiguous)(state, input); diff --git 
a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu index 4225583735460e..ad0f47418b86cf 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu index 8d039d54068aaf..76777796e361e4 100644 --- a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -351,7 +351,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight != NULL) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias != NULL) { - nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); + nOutputPlane = THCTensor_(size)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu new file mode 100644 index 00000000000000..7e285cb55fa7d2 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu @@ -0,0 +1,97 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialGridSamplerBilinear.cu" +#else + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 4, 2, grid, + "4D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t iheight = THCTensor_(size)(state, input, 2); + int64_t iwidth = THCTensor_(size)(state, input, 3); + int64_t oheight = THCTensor_(size)(state, grid, 1); + int64_t owidth = THCTensor_(size)(state, grid, 2); + + THCUNN_check_dim_size(state, grid, 4, 0, nbatch); + THCUNN_check_dim_size(state, grid, 4, 3, 2); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, oheight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, owidth); + } +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + 
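(Editorial aside, not part of the patch: the sampling math used by the updateOutput kernels above, and by the CPU path further down in this diff, can be summarized in a small self-contained sketch. It assumes the zero-padding behaviour used when padding_mode is not MODE_BORDER; the helper name sample_bilinear and the flat row-major image layout are hypothetical, chosen only for illustration.)

#include <math.h>

/* Bilinear sampling of one output location from a single-channel IH x IW
 * image stored row-major. Grid coordinates (gx, gy) are in [-1, 1] and are
 * mapped to pixel space exactly as in the kernels: x = ((gx+1)/2)*(IW-1). */
static float sample_bilinear(const float *img, int IH, int IW, float gx, float gy) {
  float ix = ((gx + 1.f) / 2.f) * (IW - 1);
  float iy = ((gy + 1.f) / 2.f) * (IH - 1);
  int ix_nw = (int)floorf(ix), iy_nw = (int)floorf(iy);   /* north-west corner */
  int ix_ne = ix_nw + 1,       iy_ne = iy_nw;
  int ix_sw = ix_nw,           iy_sw = iy_nw + 1;
  int ix_se = ix_nw + 1,       iy_se = iy_nw + 1;
  /* each weight is the area of the sub-rectangle opposite its corner */
  float nw = (ix_se - ix) * (iy_se - iy);
  float ne = (ix - ix_sw) * (iy_sw - iy);
  float sw = (ix_ne - ix) * (iy - iy_ne);
  float se = (ix - ix_nw) * (iy - iy_nw);
  float out = 0.f;
  /* out-of-bounds corners contribute zero (zero padding) */
  if (ix_nw >= 0 && ix_nw < IW && iy_nw >= 0 && iy_nw < IH) out += img[iy_nw * IW + ix_nw] * nw;
  if (ix_ne >= 0 && ix_ne < IW && iy_ne >= 0 && iy_ne < IH) out += img[iy_ne * IW + ix_ne] * ne;
  if (ix_sw >= 0 && ix_sw < IW && iy_sw >= 0 && iy_sw < IH) out += img[iy_sw * IW + ix_sw] * sw;
  if (ix_se >= 0 && ix_se < IW && iy_se >= 0 && iy_se < IH) out += img[iy_se * IW + ix_se] * se;
  return out;
}

The four weights always sum to 1, so the border padding mode (which clips each corner index instead of dropping it) effectively reproduces the nearest edge pixel for coordinates that fall outside the image. The volumetric kernels extend the same scheme to eight trilinear corner weights.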
THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state,grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + // resize output to the same shape as input + THCTensor_(resize4d)(state, output, N, C, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state, grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + THCTensor_(resize4d)(state, gradInput, N, C, IH, IW); + THCTensor_(resize4d)(state, gradGrid, N, H, W, 2); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index 3c4883a1e3c45d..eaadf66c8306ee 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -119,7 +119,6 @@ THC_API void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, - accreal input_scale, bool inplace); THC_API void THNN_(ELU_updateGradInput)( @@ -128,8 +127,7 @@ THC_API void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale, - accreal input_scale); + accreal scale); THC_API void THNN_(FeatureLPPooling_updateOutput)( THCState* state, @@ -1047,6 +1045,34 @@ THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( int outputHeight, int outputWidth); +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode); + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor 
*output, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + THC_API void THNN_(RReLU_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 310f22d03e5dfa..870d38ba225f8c 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -79,7 +79,7 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( int planeDim = 0; int dimw = 1; - int numInputDims = input->dim(); + int numInputDims = THCTensor_(nDimensionLegacyNoScalars)(state, input); if (numInputDims == 3) { planeDim++; dimw++; diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu index d6ffba3519553c..52d97fbf2a3638 100644 --- a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -75,7 +75,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu index 10a5fdc2643193..96310609e956f4 100644 --- a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -387,7 +387,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias) { - nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); + nOutputPlane = THCTensor_(size)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu new file mode 100644 index 00000000000000..086667ca476ac1 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,104 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.cu" +#else + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t idepth = THCTensor_(size)(state, input, 2); + int64_t iheight = THCTensor_(size)(state, input, 3); + int64_t iwidth = THCTensor_(size)(state, input, 4); + int64_t odepth = THCTensor_(size)(state, grid, 1); + int64_t oheight = THCTensor_(size)(state, grid, 2); + int64_t owidth = THCTensor_(size)(state, 
grid, 3); + + THCUNN_check_dim_size(state, grid, 5, 0, nbatch); + THCUNN_check_dim_size(state, grid, 5, 4, 3); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, odepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, oheight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, owidth); + } +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + // resize output to the same shape as input + THCTensor_(resize5d)(state, output, N, C, D, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + THCTensor_(resize5d)(state, gradInput, N, C, ID, IH, IW); + THCTensor_(resize5d)(state, gradGrid, N, D, H, W, 3); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THNN/generic/ClassNLLCriterion.c b/aten/src/THNN/generic/ClassNLLCriterion.c index 7db0531d60d1ef..c7d42b583374cc 100644 --- a/aten/src/THNN/generic/ClassNLLCriterion.c +++ b/aten/src/THNN/generic/ClassNLLCriterion.c @@ -82,7 +82,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - 
THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); @@ -189,7 +189,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); diff --git a/aten/src/THNN/generic/ELU.c b/aten/src/THNN/generic/ELU.c index 62111ebbf4d7c2..f2d87185b813a5 100644 --- a/aten/src/THNN/generic/ELU.c +++ b/aten/src/THNN/generic/ELU.c @@ -8,21 +8,19 @@ void THNN_(ELU_updateOutput)( THTensor *output, accreal alpha_, accreal scale, - accreal input_scale, bool inplace) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); - real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); if (inplace) { TH_TENSOR_APPLY(real, input, - *input_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; + *input_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; ); THTensor_(set)(output, input); } else { THTensor_(resizeAs)(output, input); TH_TENSOR_APPLY2(real, input, real, output, - *output_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; + *output_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; ); } } @@ -33,16 +31,14 @@ void THNN_(ELU_updateGradInput)( THTensor *gradInput, THTensor *output, accreal alpha_, - accreal scale, - accreal input_scale) + accreal scale) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); - real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); THNN_CHECK_NELEMENT(output, gradOutput); THTensor_(resizeAs)(gradInput, output); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, - *gradInput_data = *output_data <= 0 ? *gradOutput_data * negiptcoef * (*output_data + negcoef) : *gradOutput_data * poscoef; + *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + negcoef) : *gradOutput_data * poscoef; ); } diff --git a/aten/src/THNN/generic/GatedLinearUnit.c b/aten/src/THNN/generic/GatedLinearUnit.c index 0f888744240473..68cdc37d54214a 100644 --- a/aten/src/THNN/generic/GatedLinearUnit.c +++ b/aten/src/THNN/generic/GatedLinearUnit.c @@ -10,7 +10,7 @@ void THNN_(GatedLinear_updateOutput)( { // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THTensor_sizeLegacyNoScalars(input, dim); + const int64_t nIn = THTensor_(size)(input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c index fa6648e2a6b80c..2260b168d8e8d5 100644 --- a/aten/src/THNN/generic/LookupTable.c +++ b/aten/src/THNN/generic/LookupTable.c @@ -40,7 +40,7 @@ void THNN_(LookupTable_accGradParameters)( if (scaleGradByFreq) { - THIntegerTensor_(resize1d)(count, THTensor_sizeLegacyNoScalars(gradWeight, 0)); + THIntegerTensor_(resize1d)(count, gradWeight->size(0)); count_data = THIntegerTensor_(data)(count); } diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index a18252b06914d6..0699c3ac471c55 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,14 +17,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; real sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + dim = input->size(0); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); } else @@ -155,16 +155,16 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; real g; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), "vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + dim = input->size(0); + AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), "inconsistent isTarget size"); } else diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 2c8f38be23eb3a..424669e5de8515 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,19 +20,19 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; real sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); + dim = input->size(0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -136,19 +136,19 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; real g; - AT_CHECK(!input->is_empty() && (input->dim() <= 2), + AT_CHECK(!input->is_empty() && 
(input->dim() == 1 || input->dim() == 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); + dim = input->size(0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c index 1837874852d2bb..e148fde783ce9d 100644 --- a/aten/src/THNN/generic/PReLU.c +++ b/aten/src/THNN/generic/PReLU.c @@ -26,8 +26,8 @@ void THNN_(PReLU_updateOutput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -91,8 +91,8 @@ void THNN_(PReLU_updateGradInput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -162,8 +162,8 @@ void THNN_(PReLU_accGradParameters)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index 3bf8e652fa9ed9..a28d4e78477ceb 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -26,7 +26,7 @@ static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; + return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index f18a6d0817059b..fce2c8575935a5 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -72,7 +72,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for(i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) + for(i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c index 2f71861963fcdf..63e7bd81033e12 100644 --- a/aten/src/THNN/generic/SpatialDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c index eeb644fc9eb5e6..7226db67ef1a74 100644 --- a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); + nOutputPlane = THTensor_(size)(gradBias, 0); } else { return; } @@ -402,7 +402,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) 
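/* Editorial note (not part of the patch): with the dimensions annotated on the
 * three lines below (n = nOutputPlane*kh*kw, m = nInputPlane,
 * k = inputHeight*inputWidth), the gemm performed at this point -- the call
 * itself is not shown in this hunk -- accumulates, in row-major terms,
 *
 *   gradWeight[i][j] += scale * sum_p input_n[i][p] * columns[j][p]
 *
 * i.e. gradWeight (m x n) += scale * input_n (m x k) * columns^T (k x n),
 * where columns holds the unfolded per-sample buffer. The transpose flags and
 * swapped m/n arguments mentioned in the "column-major" comment only
 * compensate for BLAS layout; this reading is an interpretation added for
 * clarity, not text from the original source. */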
int64_t n = columns->size(0); // nOutputPlane * kh * kw - int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0); // nInputPlane + int64_t m = input_n->size(0); // nInputPlane int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) diff --git a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c new file mode 100644 index 00000000000000..d31f3e0a76c20a --- /dev/null +++ b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c @@ -0,0 +1,250 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 4, 2, grid, + "non-empty 4D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int oheight = THTensor_(size)(grid, 1); + int owidth = THTensor_(size)(grid, 2); + + THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 4, 3, 2); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth); + } +} + +#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \ + && y < H ? THTensor_(fastGet4d)(input, n, c, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + // resize output to the same shape as input + THTensor_(resize4d)(output, N, C, H, W); + + // loop over each output pixel + int n, h, w, c; +#pragma omp parallel for private(n, h, w, c) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, 
IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW); + real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + THTensor_(fastSet4d)(output, n, c, h, w, out_val); + } + } + } + } +} + +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H) { \ + real old_value = THTensor_(fastGet4d)(input, n, c, y, x); \ + THTensor_(fastSet4d)(input, n, c, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + THTensor_(resize4d)(gradInput, N, C, IH, IW); + THTensor_(resize4d)(gradGrid, N, H, W, 2); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, h, w; +#pragma omp parallel for private(n, h, w) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + real gix = 0; + real giy = 0; + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet4d)(gradOutput, n, c, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + 
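/* Descriptive note added for clarity (not part of the patch): the four
 * SAFE_ADD calls scatter w_corner * gradout back into gradInput at the
 * (possibly clipped) corner locations, which is the adjoint of the weighted
 * gather performed in updateOutput. The gix/giy accumulations that follow
 * apply the chain rule through the bilinear weights: for example
 * d(nw)/d(ix) = -(iy_se - iy), so the nw term contributes
 * -nw_val * (iy_se - iy) * gradout to gix. The final scaling by (IW-1)/2 and
 * (IH-1)/2 undoes the [-1, 1] -> pixel-space normalization. */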
SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + real nw_val = SAFE_GET(input, ix_nw_cl, iy_nw_cl, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne_cl, iy_ne_cl, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw_cl, iy_sw_cl, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se_cl, iy_se_cl, n, c, IH, IW); + + gix -= nw_val * (iy_se - iy) * gradout; + gix += ne_val * (iy_sw - iy) * gradout; + gix -= sw_val * (iy - iy_ne) * gradout; + gix += se_val * (iy - iy_nw) * gradout; + + giy -= nw_val * (ix_se - ix) * gradout; + giy -= ne_val * (ix - ix_sw) * gradout; + giy += sw_val * (ix_ne - ix) * gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + real gix_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 0); + real giy_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 1); + + THTensor_(fastSet4d)(gradGrid, n, h, w, 0, gix_old + gix); + THTensor_(fastSet4d)(gradGrid, n, h, w, 1, giy_old + giy); + } + } + } +} + + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 1d7a9176553756..455da04c7e4454 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -90,8 +90,7 @@ TH_API void THNN_(ELU_updateOutput)( THTensor *input, // input tensor THTensor *output, // [OUT] ELU output accreal alpha, // an ELU parameter (as in paper) - accreal scale, // scaling factor for output - accreal input_scale, // scaling factor for input + accreal scale, // scaling factor bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(ELU_updateGradInput)( THNNState *state, // library's state @@ -99,8 +98,7 @@ TH_API void THNN_(ELU_updateGradInput)( THTensor *gradInput, // [OUT] gradient w.r.t. 
input THTensor *output, // output from a forward pass accreal alpha, // an ELU parameter (as in paper) - accreal scale, - accreal input_scale); + accreal scale); TH_API void THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, // library's state @@ -1229,6 +1227,34 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( int osizeW, bool align_corners); +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index e7b51ec194c402..b623e5a2ad7fd4 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -38,7 +38,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); + int64_t inputFrameSize = weight->size(0); int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -197,7 +197,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( THNN_(TemporalRowConvolution_shapeCheck)( state, input, NULL, weight, bias, kW, dW, padW); - int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); + int64_t inputFrameSize = weight->size(0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -311,7 +311,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, NULL, kW, dW, padW); - int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); + int64_t inputFrameSize = weight->size(0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -386,7 +386,7 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( THTensor_(free)(tfinput); if (gradBias != NULL) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d)) diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index c979edf71f8f4c..4b74445e047705 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -51,7 +51,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -78,7 +78,7 @@ void 
THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, outb, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -117,7 +117,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); - int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(weight, 0); + int nOutputPlane = (int)weight->size(0); THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, gradOutput, @@ -187,9 +187,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for gradWeight, but got: %s"); - int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(gradWeight, 0); + int nOutputPlane = (int)gradWeight->size(0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && THTensor_nDimensionLegacyNoScalars(gradBias) == 1 && THTensor_sizeLegacyNoScalars(gradBias, 0) == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 209d1575dacbec..14d98a79dd29b8 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -102,7 +102,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); @@ -691,7 +691,7 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c index c9fa19f0adf488..8222c534612fd5 100644 --- a/aten/src/THNN/generic/VolumetricDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -69,7 +69,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c index 16dedeffb9c58f..4cc4dcc69837d8 100644 --- a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -154,7 +154,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( const int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - const int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + const int64_t nOutputPlane = bias->size(0); 
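/* Descriptive note added for clarity (not part of the patch): this hunk is the
 * same substitution applied throughout the diff, replacing
 * THTensor_sizeLegacyNoScalars(bias, 0) with bias->size(0). If I recall the
 * legacy helper correctly, it treated a 0-dimensional tensor as a one-element
 * vector (reporting size 1), whereas size(0) requires at least one dimension,
 * so the new call sites assume bias and the other arguments touched here are
 * never zero-dimensional. */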
THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -441,7 +441,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); + nOutputPlane = THTensor_(size)(gradBias, 0); } else { return; } diff --git a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c new file mode 100644 index 00000000000000..4d7ace422d4e97 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c @@ -0,0 +1,409 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int odepth = THTensor_(size)(grid, 1); + int oheight = THTensor_(size)(grid, 2); + int owidth = THTensor_(size)(grid, 3); + + THNN_CHECK_DIM_SIZE(grid, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 5, 4, 3); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, odepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, owidth); + } +} + +#define SAFE_GET(input, x, y, z, n, c, D, H, W) \ + x >= 0 && x < W && y >=0 && y < H && z >= 0 && z < D \ + ? 
THTensor_(fastGet5d)(input, n, c, z, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + // resize output to the same shape as input + THTensor_(resize5d)(output, N, C, D, H, W); + + // loop over each output pixel + int n, d, h, w, c; +#pragma omp parallel for private(n, d, h, w, c) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, 
ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real tnw_val = SAFE_GET(input, ix_tnw, iy_tnw, iz_tnw, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne, iy_tne, iz_tne, n, c, ID, IH, IW); + real tsw_val = SAFE_GET(input, ix_tsw, iy_tsw, iz_tsw, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse, iy_tse, iz_tse, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw, iy_bnw, iz_bnw, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne, iy_bne, iz_bne, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw, iy_bsw, iz_bsw, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse, iy_bse, iz_bse, n, c, ID, IH, IW); + real out_val = tnw_val * tnw + tne_val * tne + tsw_val * tsw + tse_val * tse + + bnw_val * bnw + bne_val * bne + bsw_val * bsw + bse_val * bse; + THTensor_(fastSet5d)(output, n, c, d, h, w, out_val); + } + } + } + } + } +} + +#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H && z >=0 && z < D) { \ + real old_value = THTensor_(fastGet5d)(input, n, c, z, y, x); \ + THTensor_(fastSet5d)(input, n, c, z, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + THTensor_(resize5d)(gradInput, N, C, ID, IH, IW); + THTensor_(resize5d)(gradGrid, N, D, H, W, 3); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, d, h, w; +//#pragma omp parallel for private(n, d, h, w) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + real gix = 0; + real giy = 0; + real giz = 0; + + // normalize ix, iy, iz from [-1, 1] to [0, W-1] & [0, H-1] & [0, D-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int 
iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet5d)(gradOutput, n, c, d, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + real tnw_val = SAFE_GET(input, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW); + 
real tsw_val = SAFE_GET(input, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW); + + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + giz = giz * (ID - 1) / 2; + + real gix_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 0); + real giy_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 1); + real giz_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 2); + + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 0, gix_old + gix); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 1, giy_old + giy); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 2, giz_old + giz); + } + } + } + } +} + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/init.cpp b/aten/src/THNN/init.cpp index c77cd76d54ec87..6c79f5be295b60 100644 --- a/aten/src/THNN/init.cpp +++ b/aten/src/THNN/init.cpp @@ -45,7 +45,7 @@ #define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ + THTensor_(size)(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -53,7 +53,7 @@ #define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \ if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ + THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] 
== %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -245,6 +245,9 @@ #include "generic/SpatialUpSamplingBilinear.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" + #include "generic/VolumetricAveragePooling.c" #include "THGenerateFloatTypes.h" @@ -301,3 +304,6 @@ #include "generic/VolumetricUpSamplingTrilinear.c" #include "THGenerateFloatTypes.h" + +#include "generic/VolumetricGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 588dae10e8e8e3..0d84ccbfb606a1 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -40,7 +40,6 @@ if(BUILD_ATEN) # ATen tests use catch instead of gtest so keep separate for now # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) - list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) @@ -52,15 +51,6 @@ if(BUILD_ATEN) set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) -else() - # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen. - # This codepath should only be exercised by the Android build. - add_subdirectory(../aten/src/ATen/core ATen_core) - list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) - list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) - list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) - # TODO: We should probably install the headers, but I don't know - # how to do that. endif() # ---[ Torch build @@ -225,72 +215,6 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") aten_set_target_props(caffe2) target_compile_options(caffe2 INTERFACE "-std=c++11") target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") -if (MSVC AND NOT BUILD_SHARED_LIBS) - # Note [Supporting both static and dynamic libraries on Window] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # A Windows library may be distributed as either a static or dynamic - # library. The chosen distribution mechanism affects how you setup - # the headers for the library: if you statically link a function, - # all you need is an ordinary signature: - # - # void f(); - # - # But if you *dynamically* link it, then you must provide a __declspec - # specifying that it should be imported from a DLL: - # - # __declspec(dllimport) void f(); - # - # Mixing the two situations will not work: if you specify dllimport - # while statically linking, the linker will complain it cannot find - # the __imp_f symbol (which serve as the DLL entrypoint); if you - # fail to specify dllimport for a symbol that's coming from a DLL, - # the linker will complain that it can't find f. Joy! - # - # Most places on the Internet, you will find people have written - # their headers under the assumption that the application will - # only ever be dynamically linked, as they define a macro which - # tags a function as __declspec(dllexport) if you are actually - # building the library, and __declspec(dllimport) otherwise. But - # if you want these headers to also work if you are linking against - # a static library, you need a way to avoid adding these __declspec's - # at all. 
And that "mechanism" needs to apply to any downstream - # libraries/executables which are going to link against your library. - # - # As an aside, why do we need to support both modes? - # For historical reasons, PyTorch ATen on Windows is built dynamically, - # while Caffe2 on Windows is built statically (mostly because if - # we build it dynamically, we are over the DLL exported symbol limit--and - # that is because Caffe2 hasn't comprehensively annotated all symbols - # which cross the DLL boundary with CAFFE_API). So any code - # which is used by both PyTorch and Caffe2 needs to support both - # modes of linking. - # - # So, you have a macro (call it AT_CORE_STATIC_WINDOWS) which you need to have - # set for any downstream library/executable that transitively includes your - # headers. How are you going to do this? You have two options: - # - # 1. Write out a config.h header which stores whether or not - # you are linking statically or dynamically. - # - # 2. Force all of users to set the the macro themselves. If they - # use cmake, you can set -DAT_CORE_STATIC_WINDOWS=1 as a PUBLIC - # compile option, in which case cmake will automatically - # add the macro for you. - # - # Which one is better? Well, it depends: they trade off implementor - # ease versus user ease: (1) is more work for the library author - # but the user doesn't have to worry about it; (2) requires the user - # to set the macro themselves... but only if they don't use cmake. - # - # So, which is appropriate in our situation? In my mind, here is - # the distinguishing factor: it is more common to distribute - # DLLs, since they don't require you to line up the CRT version - # (/MD, /MDd, /MT, /MTd) and MSVC version at the use site. So, - # if a user is already in the business of static linkage, they're - # already in "expert user" realm. So, I've decided that at this - # point in time, the simplicity of implementation of (2) wins out. 
- target_compile_options(caffe2 PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") -endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index df3ee5326b7d90..bc93f4866ebc28 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -10,6 +10,7 @@ at::Backend ATenOp::backend() const { } OPERATOR_SCHEMA(ATen); +CAFFE_KNOWN_TYPE(at::Half); namespace math { template <> diff --git a/caffe2/core/context.h b/caffe2/core/context.h index fc3969879f30c4..f2831909e1587a 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,9 +13,6 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2.pb.h" -#include "ATen/core/ATenCoreTest.h" -#include "ATen/core/ArrayRef.h" - CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); namespace caffe2 { diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index 8924a9dc931be9..a6e44846e9e0be 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -6,11 +6,6 @@ namespace caffe2 { -TEST(CPUContextTest, ATenCoreTest) { - int i = at::CoreTest(); - EXPECT_EQ(i + 1, at::CoreTest()); -} - TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); diff --git a/caffe2/core/dispatch/DeviceId.h b/caffe2/core/dispatch/DeviceId.h index e5744ce1e1c2d6..e74a803557ea0d 100644 --- a/caffe2/core/dispatch/DeviceId.h +++ b/caffe2/core/dispatch/DeviceId.h @@ -1,8 +1,8 @@ #pragma once -#include #include #include +#include "caffe2/utils/C++17.h" namespace c10 { diff --git a/caffe2/core/dispatch/LayoutId.h b/caffe2/core/dispatch/LayoutId.h index 9ec44519b95a99..7f039fadfa9698 100644 --- a/caffe2/core/dispatch/LayoutId.h +++ b/caffe2/core/dispatch/LayoutId.h @@ -1,10 +1,10 @@ #pragma once -#include "ATen/core/IdWrapper.h" +#include "caffe2/utils/IdWrapper.h" namespace c10 { -class LayoutId final : public at::IdWrapper { +class LayoutId final : public c10::guts::IdWrapper { public: constexpr explicit LayoutId(underlying_type id): IdWrapper(id) {} @@ -19,4 +19,4 @@ class LayoutId final : public at::IdWrapper { } -AT_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) +C10_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) diff --git a/caffe2/core/dispatch/TensorTypeId.h b/caffe2/core/dispatch/TensorTypeId.h index 244817904667b9..a80fc8377c8ca5 100644 --- a/caffe2/core/dispatch/TensorTypeId.h +++ b/caffe2/core/dispatch/TensorTypeId.h @@ -1,6 +1,6 @@ #pragma once -#include "ATen/core/IdWrapper.h" +#include "caffe2/utils/IdWrapper.h" #include #include #include @@ -21,7 +21,7 @@ namespace details { /** * Dynamic type ID of a Tensor argument. It represents something like CPUTensor, etc. */ -class TensorTypeId final : public at::IdWrapper { +class TensorTypeId final : public guts::IdWrapper { public: // Don't use this! 
// Unfortunately, a default constructor needs to be defined because of https://reviews.llvm.org/D41223 @@ -35,4 +35,4 @@ class TensorTypeId final : public at::IdWrapper +#include "caffe2/utils/C++17.h" namespace c10 { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h index 70490856b5ecaf..1e8156abe42172 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h @@ -659,3 +659,336 @@ class NHWC2NCHW : public NeuralNetOperator { private: }; + +class Int8Quantize : public NeuralNetOperator { + public: + Int8Quantize() : NeuralNetOperator(NNKind::Int8Quantize) {} + + ~Int8Quantize() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Quantize); + + private: +}; + +class Int8Dequantize : public NeuralNetOperator { + public: + Int8Dequantize() : NeuralNetOperator(NNKind::Int8Dequantize) {} + + ~Int8Dequantize() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Dequantize); + + private: +}; + +class Int8AveragePool : public NeuralNetOperator { + public: + Int8AveragePool() : NeuralNetOperator(NNKind::Int8AveragePool) {} + + Int8AveragePool(const AveragePool& averagePool) + : NeuralNetOperator(NNKind::Int8AveragePool) {} + + ~Int8AveragePool() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePool); + + private: +}; + +class Int8Conv : public NeuralNetOperator { + public: + Int8Conv() : NeuralNetOperator(NNKind::Int8Conv) {} + + Int8Conv(const Conv& conv) : NeuralNetOperator(NNKind::Int8Conv) {} + + ~Int8Conv() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Conv); + + private: +}; + +class Int8ConvTranspose : public NeuralNetOperator { + public: + Int8ConvTranspose() : NeuralNetOperator(NNKind::Int8ConvTranspose) {} + + Int8ConvTranspose(const ConvTranspose& convTranspose) + : NeuralNetOperator(NNKind::Int8ConvTranspose) {} + + ~Int8ConvTranspose() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvTranspose); + + private: +}; + +class Int8FC : public NeuralNetOperator { + public: + Int8FC() : NeuralNetOperator(NNKind::Int8FC) {} + + Int8FC(const FC& fC) : NeuralNetOperator(NNKind::Int8FC) {} + + ~Int8FC() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8FC); + + private: +}; + +class Int8MaxPool : public NeuralNetOperator { + public: + Int8MaxPool() : NeuralNetOperator(NNKind::Int8MaxPool) {} + + Int8MaxPool(const MaxPool& maxPool) + : NeuralNetOperator(NNKind::Int8MaxPool) {} + + ~Int8MaxPool() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPool); + + private: +}; + +class Int8Relu : public NeuralNetOperator { + public: + Int8Relu() : NeuralNetOperator(NNKind::Int8Relu) {} + + Int8Relu(const Relu& relu) : NeuralNetOperator(NNKind::Int8Relu) {} + + ~Int8Relu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Relu); + + private: +}; + +class Int8GivenTensorFill : public NeuralNetOperator { + public: + Int8GivenTensorFill() : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} + + Int8GivenTensorFill(const GivenTensorFill& givenTensorFill) + : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} + + ~Int8GivenTensorFill() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8GivenTensorFill); + + private: +}; + +class Int8Concat : public NeuralNetOperator { + public: + Int8Concat() : NeuralNetOperator(NNKind::Int8Concat) {} + + Int8Concat(const Concat& concat) : NeuralNetOperator(NNKind::Int8Concat) {} + + ~Int8Concat() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Concat); + + private: +}; + +class Int8Softmax : public NeuralNetOperator { + public: + Int8Softmax() : NeuralNetOperator(NNKind::Int8Softmax) {} + + Int8Softmax(const 
Softmax& softmax) + : NeuralNetOperator(NNKind::Int8Softmax) {} + + ~Int8Softmax() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Softmax); + + private: +}; + +class Int8ChannelShuffle : public NeuralNetOperator { + public: + Int8ChannelShuffle() : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} + + Int8ChannelShuffle(const ChannelShuffle& channelShuffle) + : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} + + ~Int8ChannelShuffle() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8ChannelShuffle); + + private: +}; + +class Int8Sum : public NeuralNetOperator { + public: + Int8Sum() : NeuralNetOperator(NNKind::Int8Sum) {} + + Int8Sum(const Sum& sum) : NeuralNetOperator(NNKind::Int8Sum) {} + + ~Int8Sum() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Sum); + + private: +}; + +class Int8Add : public NeuralNetOperator { + public: + Int8Add() : NeuralNetOperator(NNKind::Int8Add) {} + + Int8Add(const Add& add) : NeuralNetOperator(NNKind::Int8Add) {} + + ~Int8Add() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Add); + + private: +}; + +class Int8Reshape : public NeuralNetOperator { + public: + Int8Reshape() : NeuralNetOperator(NNKind::Int8Reshape) {} + + Int8Reshape(const Reshape& reshape) + : NeuralNetOperator(NNKind::Int8Reshape) {} + + ~Int8Reshape() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Reshape); + + private: +}; + +class Int8Flatten : public NeuralNetOperator { + public: + Int8Flatten() : NeuralNetOperator(NNKind::Int8Flatten) {} + + Int8Flatten(const Flatten& flatten) + : NeuralNetOperator(NNKind::Int8Flatten) {} + + ~Int8Flatten() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Flatten); + + private: +}; + +class Int8ConvRelu : public NeuralNetOperator { + public: + Int8ConvRelu() : NeuralNetOperator(NNKind::Int8ConvRelu) {} + + Int8ConvRelu(const ConvRelu& convRelu) + : NeuralNetOperator(NNKind::Int8ConvRelu) {} + + ~Int8ConvRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvRelu); + + private: +}; + +class Int8SumRelu : public NeuralNetOperator { + public: + Int8SumRelu() : NeuralNetOperator(NNKind::Int8SumRelu) {} + + Int8SumRelu(const SumRelu& sumRelu) + : NeuralNetOperator(NNKind::Int8SumRelu) {} + + ~Int8SumRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8SumRelu); + + private: +}; + +class Int8AveragePoolRelu : public NeuralNetOperator { + public: + Int8AveragePoolRelu() : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} + + Int8AveragePoolRelu(const AveragePoolRelu& averagePoolRelu) + : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} + + ~Int8AveragePoolRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePoolRelu); + + private: +}; + +class Int8MaxPoolRelu : public NeuralNetOperator { + public: + Int8MaxPoolRelu() : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} + + Int8MaxPoolRelu(const MaxPoolRelu& maxPoolRelu) + : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} + + ~Int8MaxPoolRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPoolRelu); + + private: +}; + +class BatchMatMul : public NeuralNetOperator { + public: + BatchMatMul(bool transA = false, bool transB = true, bool broadcast = false) + : NeuralNetOperator(NNKind::BatchMatMul), + TransA(transA), + TransB(transB), + Broadcast(broadcast) {} + + ~BatchMatMul() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(BatchMatMul); + + bool getTransA() const { + return TransA; + } + + bool getTransB() const { + return TransB; + } + + bool getBroadcast() const { + return Broadcast; + } + + void setTransA(bool transA) { + TransA = transA; + } + + void setTransB(bool transB) { + TransB = transB; + } + + void setBroadcast(bool broadcast) { + Broadcast = broadcast; + } + + private: + bool TransA; + bool TransB; + bool 
Broadcast; +}; + +class BatchGather : public NeuralNetOperator { + public: + BatchGather() : NeuralNetOperator(NNKind::BatchGather) {} + + ~BatchGather() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(BatchGather); + + private: +}; + +class ConcatBatchMatMulBatchGatherOp : public NeuralNetOperator { + public: + ConcatBatchMatMulBatchGatherOp() + : NeuralNetOperator(NNKind::ConcatBatchMatMulBatchGatherOp) {} + + ~ConcatBatchMatMulBatchGatherOp() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(ConcatBatchMatMulBatchGatherOp); + + private: +}; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h index 4d15dd40613403..9c4277293d0b41 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h @@ -1,4 +1,9 @@ Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool, MaxPoolRelu, Sum, SumRelu, Send, Receive, BatchNormalization, FC, GivenTensorFill, Concat, Softmax, ChannelShuffle, Add, Reshape, Flatten, - NCHW2NHWC, NHWC2NCHW + NCHW2NHWC, NHWC2NCHW, Int8Quantize, Int8Dequantize, Int8AveragePool, + Int8Conv, Int8ConvTranspose, Int8FC, Int8MaxPool, Int8Relu, + Int8GivenTensorFill, Int8Concat, Int8Softmax, Int8ChannelShuffle, Int8Sum, + Int8Add, Int8Reshape, Int8Flatten, Int8ConvRelu, Int8SumRelu, + Int8AveragePoolRelu, Int8MaxPoolRelu, BatchMatMul, BatchGather, + ConcatBatchMatMulBatchGatherOp diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h index 88ffa0b1ba6bb0..87ffda3c4f3436 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h @@ -1,68 +1,92 @@ case NNKind::Relu: return "Relu"; - case NNKind::Conv: return "Conv"; - case NNKind::ConvRelu: return "ConvRelu"; - case NNKind::ConvTranspose: return "ConvTranspose"; - case NNKind::AveragePool: return "AveragePool"; - case NNKind::AveragePoolRelu: return "AveragePoolRelu"; - case NNKind::MaxPool: return "MaxPool"; - case NNKind::MaxPoolRelu: return "MaxPoolRelu"; - case NNKind::Sum: return "Sum"; - case NNKind::SumRelu: return "SumRelu"; - case NNKind::Send: return "Send"; - case NNKind::Receive: return "Receive"; - case NNKind::BatchNormalization: return "BatchNormalization"; - case NNKind::FC: return "FC"; - case NNKind::GivenTensorFill: return "GivenTensorFill"; - case NNKind::Concat: return "Concat"; - case NNKind::Softmax: return "Softmax"; - case NNKind::ChannelShuffle: return "ChannelShuffle"; - case NNKind::Add: return "Add"; - case NNKind::Reshape: return "Reshape"; - case NNKind::Flatten: return "Flatten"; - case NNKind::NCHW2NHWC: return "NCHW2NHWC"; - case NNKind::NHWC2NCHW: return "NHWC2NCHW"; +case NNKind::Int8Quantize: + return "Int8Quantize"; +case NNKind::Int8Dequantize: + return "Int8Dequantize"; +case NNKind::Int8AveragePool: + return "Int8AveragePool"; +case NNKind::Int8Conv: + return "Int8Conv"; +case NNKind::Int8ConvTranspose: + return "Int8ConvTranspose"; +case NNKind::Int8FC: + return "Int8FC"; +case NNKind::Int8MaxPool: + return "Int8MaxPool"; +case NNKind::Int8Relu: + return "Int8Relu"; +case NNKind::Int8GivenTensorFill: + return "Int8GivenTensorFill"; +case NNKind::Int8Concat: + return "Int8Concat"; +case NNKind::Int8Softmax: + return "Int8Softmax"; +case NNKind::Int8ChannelShuffle: + return "Int8ChannelShuffle"; +case NNKind::Int8Sum: + return "Int8Sum"; +case NNKind::Int8Add: + return "Int8Add"; 
+case NNKind::Int8Reshape: + return "Int8Reshape"; +case NNKind::Int8Flatten: + return "Int8Flatten"; +case NNKind::Int8ConvRelu: + return "Int8ConvRelu"; +case NNKind::Int8SumRelu: + return "Int8SumRelu"; +case NNKind::Int8AveragePoolRelu: + return "Int8AveragePoolRelu"; +case NNKind::Int8MaxPoolRelu: + return "Int8MaxPoolRelu"; +case NNKind::BatchMatMul: + return "BatchMatMul"; +case NNKind::BatchGather: + return "BatchGather"; +case NNKind::ConcatBatchMatMulBatchGatherOp: + return "ConcatBatchMatMulBatchGatherOp"; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index aab127d8c56e16..3c5148e5b6c70f 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -46,31 +46,28 @@ class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) - : StorageType(std::forward(args)...), - tail_(tail), - head_(head) { + : StorageType(std::forward(args)...), Tail(tail), Head(head) { DEBUG_PRINT("Creating instance of Edge: %p\n", this); } const NodeRef& tail() const { - return tail_; + return Tail; } const NodeRef& head() const { - return head_; + return Head; } void setTail(NodeRef n) { - tail_ = n; + Tail = n; } void setHead(NodeRef n) { - head_ = n; + Head = n; } private: - NodeRef tail_; - NodeRef head_; - + NodeRef Tail; + NodeRef Head; friend class Graph; }; @@ -91,55 +88,54 @@ class Node : public StorageType, public Notifier> { /// \brief Adds an edge by reference to known in-edges. /// \p e A reference to an edge that will be added as an in-edge. void addInEdge(EdgeRef e) { - inEdges_.emplace_back(e); + inEdges.emplace_back(e); } /// \brief Adds an edge by reference to known out-edges. /// \p e A reference to an edge that will be added as an out-edge. void addOutEdge(EdgeRef e) { - outEdges_.emplace_back(e); + outEdges.emplace_back(e); } /// \brief Removes an edge by reference to known in-edges. /// \p e A reference to an edge that will be removed from in-edges. void removeInEdge(EdgeRef e) { - removeEdgeInternal(inEdges_, e); + auto iter = std::find(inEdges.begin(), inEdges.end(), e); + assert( + iter != inEdges.end() && + "Attempted to remove edge that isn't connected to this node"); + inEdges.erase(iter); } /// \brief Removes an edge by reference to known out-edges. /// \p e A reference to an edge that will be removed from out-edges. 
void removeOutEdge(EdgeRef e) { - removeEdgeInternal(outEdges_, e); + auto iter = std::find(outEdges.begin(), outEdges.end(), e); + assert( + iter != outEdges.end() && + "Attempted to remove edge that isn't connected to this node"); + outEdges.erase(iter); } const std::vector& getOutEdges() const { - return outEdges_; + return outEdges; } const std::vector& getInEdges() const { - return inEdges_; + return inEdges; } - void setInEdges(std::vector edges) { - inEdges_ = edges; + void setInEdges(std::vector es) { + inEdges = es; } - void setOutEdges(std::vector edges) { - outEdges_ = edges; + void setOutEdges(std::vector es) { + outEdges = es; } - private: - std::vector inEdges_; - std::vector outEdges_; - + protected: + std::vector inEdges; + std::vector outEdges; friend class Graph; - - void removeEdgeInternal(std::vector& edges, EdgeRef e) { - auto iter = std::find(edges.begin(), edges.end(), e); - assert( - iter != edges.end() && - "Attempted to remove edge that isn't connected to this node"); - edges.erase(iter); - } }; /// \brief Effectively a constant reference to a graph. @@ -162,56 +158,46 @@ class Subgraph { using EdgeRef = typename Graph::EdgeRef; void addNode(NodeRef n) { - nodes_.insert(n); + Nodes.insert(n); } - bool hasNode(NodeRef n) const { - return nodes_.count(n) != 0; + return Nodes.count(n) != 0; } - void removeNode(NodeRef n) { - nodes_.erase(n); + Nodes.erase(n); } void addEdge(EdgeRef e) { - edges_.insert(e); + Edges.insert(e); } - - bool hasEdge(EdgeRef e) const { - return edges_.count(e) != 0; + bool hasEdge(EdgeRef n) const { + return Edges.count(n) != 0; } - void removeEdge(EdgeRef e) { - edges_.erase(e); + Edges.erase(e); } const std::unordered_set& getNodes() const { - return nodes_; - } - - const size_t getNodesCount() const { - return (size_t)nodes_.size(); + return Nodes; } - const std::unordered_set& getEdges() const { - return edges_; + return Edges; } - private: - std::unordered_set nodes_; - std::unordered_set edges_; - void printEdges() { - for (const auto& edge : edges_) { + for (const auto& edge : Edges) { printf("Edge: %p (%p -> %p)\n", &edge, edge->tail(), edge->head()); } } void printNodes() const { - for (const auto& node : nodes_) { + for (const auto& node : Nodes) { printf("Node: %p\n", node); } } + + std::unordered_set Nodes; + std::unordered_set Edges; }; /// \brief A simple graph implementation @@ -245,21 +231,21 @@ class Graph { } void importNode(NodeRef node, Graph& otherGraph) { - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { if (&(*it) == node) { - std::list>& otherNodes = otherGraph.nodes_; - otherNodes.splice(otherNodes.end(), nodes_, it, ++it); - otherGraph.nodeRefs_.insert(node); + std::list>& otherNodes = otherGraph.Nodes; + otherNodes.splice(otherNodes.end(), Nodes, it, ++it); + otherGraph.NodeRefs.insert(node); break; } } } void importEdge(EdgeRef edge, Graph& otherGraph) { - std::list>& otherEdges = otherGraph.edges_; - for (auto it = edges_.begin(); it != edges_.end(); ++it) { + std::list>& otherEdges = otherGraph.Edges; + for (auto it = Edges.begin(); it != Edges.end(); ++it) { if (&(*it) == edge) { - otherEdges.splice(otherEdges.end(), edges_, it, ++it); + otherEdges.splice(otherEdges.end(), Edges, it, ++it); break; } } @@ -327,9 +313,9 @@ class Graph { /// \return A reference to the edge created. EdgeRef createEdge(NodeRef tail, NodeRef head, U... 
data) { DEBUG_PRINT("Creating edge (%p -> %p)\n", tail, head); - this->edges_.emplace_back( + this->Edges.emplace_back( Edge(tail, head, std::forward(data)...)); - EdgeRef e = &this->edges_.back(); + EdgeRef e = &this->Edges.back(); head->addInEdge(e); tail->addOutEdge(e); return e; @@ -353,85 +339,85 @@ class Graph { /// related to the node. void deleteNode(NodeRef n, bool deleteEdges = true) { if (deleteEdges) { - auto inEdges = n->inEdges_; + auto inEdges = n->inEdges; for (auto& edge : inEdges) { deleteEdge(edge); } - auto outEdges = n->outEdges_; + auto outEdges = n->outEdges; for (auto& edge : outEdges) { deleteEdge(edge); } } - for (auto i = nodes_.begin(); i != nodes_.end(); ++i) { + for (auto i = Nodes.begin(); i != Nodes.end(); ++i) { if (&*i == n) { - nodeRefs_.erase(n); - nodes_.erase(i); + NodeRefs.erase(n); + Nodes.erase(i); break; } } } - bool hasNode(NodeRef node) const { - return nodeRefs_.find(node) != nodeRefs_.end(); + bool hasNode(NodeRef ref) const { + return NodeRefs.find(ref) != NodeRefs.end(); } /// \brief Deletes a edge from the graph. /// \p e A reference to the edge. - void deleteEdge(EdgeRef e, bool removeRef = true) { - if (removeRef) { - e->tail_->removeOutEdge(e); - e->head_->removeInEdge(e); + void deleteEdge(EdgeRef e, bool remove_ref = true) { + if (remove_ref) { + e->Tail->removeOutEdge(e); + e->Head->removeInEdge(e); } - for (auto i = edges_.begin(); i != edges_.end(); ++i) { + for (auto i = Edges.begin(); i != Edges.end(); ++i) { if (&*i == e) { - edges_.erase(i); + Edges.erase(i); break; } } } const std::vector getMutableNodes() { - std::vector result; - for (auto& n : nodes_) { + std::vector v; + for (auto& n : Nodes) { DEBUG_PRINT("Adding node to mutable output (%p)\n", &n); - result.emplace_back(&n); + v.emplace_back(&n); } - return result; + return v; } const std::vector getMutableEdges() { - std::vector result; - for (auto& e : edges_) { + std::vector v; + for (auto& e : Edges) { DEBUG_PRINT("Adding edge to mutable output (%p)\n", &e); - result.emplace_back(&e); + v.emplace_back(&e); } - return result; - } - - private: - std::list> nodes_; - std::list> edges_; - std::unordered_set nodeRefs_; - - NodeRef createNodeInternal(Node&& node) { - nodes_.emplace_back(std::move(node)); - NodeRef nodeRef = &nodes_.back(); - DEBUG_PRINT("Creating node (%p)\n", nodeRef); - nodeRefs_.insert(nodeRef); - return nodeRef; + return v; } void printEdges() { - for (const auto& edge : edges_) { + for (const auto& edge : Edges) { printf("Edge: %p (%p -> %p)\n", &edge, edge.tail(), edge.head()); } } void printNodes() const { - for (const auto& node : nodes_) { + for (const auto& node : Nodes) { printf("Node: %p\n", &node); } } + + private: + std::list> Nodes; + std::list> Edges; + std::unordered_set NodeRefs; + + NodeRef createNodeInternal(Node&& node) { + Nodes.emplace_back(std::move(node)); + NodeRef nodeRef = &Nodes.back(); + DEBUG_PRINT("Creating node (%p)\n", nodeRef); + NodeRefs.insert(nodeRef); + return nodeRef; + } }; } // namespace nom diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h deleted file mode 100644 index 08ead742950740..00000000000000 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H -#define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H - -namespace nom { - -namespace matcher { - -/* - * Subtree matching criteria consists of - * - 
Node matching criteria for the subtree's root. - * - Children subtree matching criteria - * - A count, which means we may want more than one of this subtree. The count - * can be unlimited. The count is only used when we match children of a - * subtree root, not matching the subtree itself. - */ -template -class SubtreeMatchCriteria { - public: - static const int kStarCount = -1; - SubtreeMatchCriteria( - const NodeMatchCriteria& root, - const std::vector& children, - int count) - : root_(root), children_(children), count_(count){}; - - private: - NodeMatchCriteria root_; - std::vector children_; - int count_; - - template - friend class SubgraphMatcher; -}; - -/* - * Utilities for subgraph matching. - */ -template < - typename GraphType, - typename NodeMatchCriteria, - typename NodeMatcherClass> -struct SubgraphMatcher { - static bool isNodeMatch( - typename GraphType::NodeRef node, - const NodeMatchCriteria& criteria) { - return NodeMatcherClass::isMatch(node, criteria); - } - - // Check if there can be a sub-tree that matches the given criteria that - // is rooted at the given rootNode. - // The flag invertGraphTraversal specify if we should follow out edges or - // in edges. The default is true which is useful for a functional - // intepretation of a dataflow graph. - static bool isSubtreeMatch( - typename GraphType::NodeRef root, - const SubtreeMatchCriteria& criteria, - bool invertGraphTraversal = true) { - if (!isNodeMatch(root, criteria.root_)) { - return false; - } - auto& edges = - invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); - - int numEdges = edges.size(); - int numChildrenCriteria = criteria.children_.size(); - - // The current algorithm implies that the ordering of the children is - // important. The children nodes will be matched with the children subtree - // criteria in the given order. - - int currentEdgeIdx = 0; - for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; - criteriaIdx++) { - auto childrenCriteria = criteria.children_[criteriaIdx]; - - int expectedCount = childrenCriteria.count_; - bool isStarCount = - expectedCount == SubtreeMatchCriteria::kStarCount; - - int countMatch = 0; - - // Continue to match subsequent edges with the current children criteria. - // Note that if the child criteria is a * pattern, this greedy algorithm - // will attempt to find the longest possible sequence that matches the - // children criteria. - for (; currentEdgeIdx < numEdges && - (isStarCount || countMatch < expectedCount); - currentEdgeIdx++) { - auto edge = edges[currentEdgeIdx]; - auto nextNode = invertGraphTraversal ? edge->tail() : edge->head(); - - if (!isSubtreeMatch(nextNode, childrenCriteria, invertGraphTraversal)) { - if (!isStarCount) { - // If the current criteria isn't a * pattern, this indicates a - // failure. - return false; - } else { - // Otherwise, we should move on to the next children criteria. - break; - } - } - - countMatch++; - } - - if (countMatch < expectedCount) { - // Fails because there are not enough matches as specified by the - // criteria. - return false; - } - } - - if (currentEdgeIdx < numEdges) { - // Fails because there are unmatched edges. - return false; - } - return true; - } - - // Utility to transform a graph by looking for subtrees that match - // a given pattern and then allow callers to mutate the graph based on - // subtrees that are found. - // The current implementation doesn't handle any graph transformation - // itself. 
Callers should be responsible for all intended mutation, including - // deleting nodes in the subtrees found by this algorithm. - // Note: if the replaceFunction lambda returns false, the entire procedure - // is aborted. This maybe useful in certain cases when we want to terminate - // the subtree search early. - // invertGraphTraversal flag: see documentation in isSubtreeMatch - static void replaceSubtree( - GraphType& graph, - const SubtreeMatchCriteria& criteria, - const std::function< - bool(GraphType& g, typename GraphType::NodeRef subtreeRoot)>& - replaceFunction, - bool invertGraphTraversal = true) { - for (auto nodeRef : graph.getMutableNodes()) { - // Make sure the node is still in the graph. - if (!graph.hasNode(nodeRef)) { - continue; - } - if (isSubtreeMatch(nodeRef, criteria, invertGraphTraversal)) { - if (!replaceFunction(graph, nodeRef)) { - // If replaceFunction returns false, it means that we should abort - // the entire procedure. - break; - } - } - } - } -}; - -// Convenient methods to create subtree matching criteria. -template -SubtreeMatchCriteria tree( - const NodeMatchCriteria& root, - const std::vector>& children = {}, - int count = 1) { - return SubtreeMatchCriteria(root, children, count); -} - -template -SubtreeMatchCriteria treeStar( - const NodeMatchCriteria& root, - const std::vector>& children = {}) { - return tree( - root, children, SubtreeMatchCriteria::kStarCount); -} - -} // namespace matcher - -} // namespace nom - -#endif // NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index 2d1125f5762ad4..c62148ea52cff5 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -6,8 +6,6 @@ from __future__ import unicode_literals import argparse -from textwrap import dedent -from subprocess import call def parse_lines(lines): @@ -24,27 +22,25 @@ def parse_lines(lines): index = 0 while index < len(lines): line = lines[index] - if line.lower().startswith("macro"): - assert parse_state == EMPTY - macro_line = line.split(" ") + if line.lower().startswith('macro'): + assert (parse_state == EMPTY) + macro_line = line.split(' ') # Support macros that look like attributes # e.g. 
macro - CONV_LIKE - curr_macro = " ".join(macro_line[1:]) - assert curr_macro not in macros, 'Macro "{}" defined twice.'.format( - curr_macro - ) + curr_macro = ' '.join(macro_line[1:]) + assert (curr_macro not in macros) macros[curr_macro] = [] parse_state = MACRO - lines = lines[:index] + lines[index + 1 :] + lines = lines[:index] + lines[index + 1:] continue - elif line.lower().startswith("endmacro"): - assert parse_state == MACRO + elif line.lower().startswith('endmacro'): + assert (parse_state == MACRO) parse_state = EMPTY - lines = lines[:index] + lines[index + 1 :] + lines = lines[:index] + lines[index + 1:] continue elif parse_state == MACRO: macros[curr_macro].append(line) - lines = lines[:index] + lines[index + 1 :] + lines = lines[:index] + lines[index + 1:] continue index += 1 @@ -52,7 +48,7 @@ def parse_lines(lines): while index < len(lines): line = lines[index] if line in macros: - lines = lines[:index] + macros[line] + lines[index + 1 :] + lines = lines[:index] + macros[line] + lines[index + 1:] index += len(macros[line]) - 1 index += 1 @@ -67,20 +63,20 @@ def parse_lines(lines): for line in lines: if not len(line): continue - if line[0] == "-": - assert parse_state is OP - attr = [_.strip() for _ in line[1:].split(":")] - assert attr[0][0].isupper() - if len(attr) == 2: # attribute : type + if line[0] == '-': + assert (parse_state is OP) + attr = [_.strip() for _ in line[1:].split(':')] + assert (attr[0][0].isupper()) + if (len(attr) == 2): # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1])) - elif len(attr) == 3: # attribute : type + elif (len(attr) == 3): # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1], attr[2])) else: - op = [l.strip() for l in line.split(":")] - assert len(op[0].split(" ")) == 1 + op = [l.strip() for l in line.split(':')] + assert (len(op[0].split(' ')) == 1) parse_state = OP curr_op = op[0] - assert curr_op not in ops + assert (curr_op not in ops) ops[curr_op] = {} op_list.append(curr_op) if len(op) > 1: @@ -105,26 +101,20 @@ def gen_class(op, op_def): attr_arg = "{type} {lower_name}".format( type=t, lower_name=lower_name + default_arg ) - attr_init = "{name}({lower_name})".format(name=name, lower_name=lower_name) - attr_declare = "{type} {name};".format(type=t, name=name) - attr_get = dedent( - """ - {type} get{name}() const {{ - return {name}; - }} - """.format( - type=t, name=name - ) - ) - attr_set = dedent( - """ - void set{name}({type} {lower_name}) {{ - {name} = {lower_name}; - }} - """.format( - type=t, name=name, lower_name=lower_name - ) + attr_init = "{name}({lower_name})".format( + name=name, lower_name=lower_name ) + attr_declare = "{type} {name};".format(type=t, name=name) + attr_get = """ + {type} get{name}() const {{ + return {name}; + }} +""".format(type=t, name=name) + attr_set = """ + void set{name}({type} {lower_name}) {{ + {name} = {lower_name}; + }} +""".format(type=t, name=name, lower_name=lower_name) attribute_args.append(attr_arg) attribute_init.append(attr_init) attribute_declarations.append(attr_declare) @@ -142,43 +132,38 @@ def gen_class(op, op_def): name=attr[0], other_op=lower_other_op ) ) - init = dedent( - """ - {op}(const {other_op}& {lower_other_op}) : - {other_init} {{}} - """.format( - op=op, - other_op=other_op, - lower_other_op=lower_other_op, - other_init=",\n ".join(other_init), - ) + init = """ + {op}(const {other_op}& {lower_other_op}) : + {other_init} {{}} +""".format( + op=op, + other_op=other_op, + lower_other_op=lower_other_op, + other_init=',\n 
'.join(other_init) ) extra_init += init - return dedent( - """ - class {op} : public NeuralNetOperator {{ - public: - {op}({attribute_args}) : - {attribute_init} {{}} - {extra_init} - ~{op}() {{}} - - NOMNIGRAPH_DEFINE_NN_RTTI({op}); - {getters}{setters} - private: - {attribute_declarations} - }}; - - """.format( - op=op, - extra_init=extra_init, - getters="".join(attribute_getters), - setters="".join(attribute_setters), - attribute_args=",\n".join(attribute_args), - attribute_init=",\n".join(attribute_init), - attribute_declarations="\n".join(attribute_declarations), - ) + return """class {op} : public NeuralNetOperator {{ + public: + {op}({attribute_args}) : + {attribute_init} {{}} + {extra_init} + ~{op}() {{}} + + NOMNIGRAPH_DEFINE_NN_RTTI({op}); +{getters}{setters} + private: + {attribute_declarations} +}}; + +""".format( + op=op, + extra_init=extra_init, + getters=''.join(attribute_getters), + setters=''.join(attribute_setters), + attribute_args=',\n '.join(attribute_args), + attribute_init=',\n '.join(attribute_init), + attribute_declarations='\n '.join(attribute_declarations) ) @@ -190,51 +175,33 @@ def gen_classes(ops, op_list): def gen_enum(op_list): - return ",\n".join([op for op in op_list]) + "\n" + return ',\n'.join([op for op in op_list]) + '\n' def gen_names(op_list): f = "" for op in op_list: - f += dedent( - """ - case NNKind::{name}: - return \"{name}\"; - """.format( - name=op - ) - ) + f += """case NNKind::{name}: + return \"{name}\"; +""".format(name=op) return f if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate op files.") - parser.add_argument("--install_dir", help="installation directory") - parser.add_argument("--source_def", help="ops.def", action="append") + parser = argparse.ArgumentParser(description='Generate op files.') + parser.add_argument('--install_dir', help='installation directory') + parser.add_argument('--source_def', help='ops.def') args = parser.parse_args() install_dir = args.install_dir - sources = args.source_def - lines = [] - for source in sources: - with open(source, "rb") as f: - lines_tmp = f.readlines() - lines += [l.strip().decode("utf-8") for l in lines_tmp] + with open(args.source_def, 'rb') as f: + lines = f.readlines() + lines = [l.strip().decode("utf-8") for l in lines] ops, op_list = parse_lines(lines) - with open(install_dir + "/OpClasses.h", "wb") as f: + with open(install_dir + '/OpClasses.h', 'wb') as f: f.write(gen_classes(ops, op_list).encode("utf-8")) - with open(install_dir + "/OpNames.h", "wb") as f: + with open(install_dir + '/OpNames.h', 'wb') as f: f.write(gen_names(op_list).encode("utf-8")) - with open(install_dir + "/OpEnum.h", "wb") as f: + with open(install_dir + '/OpEnum.h', 'wb') as f: f.write(gen_enum(op_list).encode("utf-8")) - - try: - cmd = ["clang-format", "-i", install_dir + "/OpClasses.h"] - call(cmd) - cmd = ["clang-format", "-i", install_dir + "/OpNames.h"] - call(cmd) - cmd = ["clang-format", "-i", install_dir + "/OpEnum.h"] - call(cmd) - except Exception: - pass diff --git a/caffe2/core/nomnigraph/ops.def b/caffe2/core/nomnigraph/ops.def index 6183e3c25726a3..53dd951c8fc1c2 100644 --- a/caffe2/core/nomnigraph/ops.def +++ b/caffe2/core/nomnigraph/ops.def @@ -69,3 +69,30 @@ CopyFromOpenCL NCHW2NHWC NHWC2NCHW +Int8Quantize +Int8Dequantize +Int8AveragePool : AveragePool +Int8Conv : Conv +Int8ConvTranspose : ConvTranspose +Int8FC : FC +Int8MaxPool : MaxPool +Int8Relu : Relu +Int8GivenTensorFill : GivenTensorFill +Int8Concat : Concat +Int8Softmax : Softmax +Int8ChannelShuffle : 
ChannelShuffle +Int8Sum : Sum +Int8Add : Add +Int8Reshape : Reshape +Int8Flatten : Flatten +Int8ConvRelu : ConvRelu +Int8SumRelu : SumRelu +Int8AveragePoolRelu : AveragePoolRelu +Int8MaxPoolRelu : MaxPoolRelu + +BatchMatMul +- TransA : bool : false +- TransB : bool : true +- Broadcast: bool : false +BatchGather +ConcatBatchMatMulBatchGatherOp diff --git a/caffe2/core/nomnigraph/tests/binary_match_test.cc b/caffe2/core/nomnigraph/tests/binary_match_test.cc index ca3fd11b3a9126..4834cea30f3e23 100644 --- a/caffe2/core/nomnigraph/tests/binary_match_test.cc +++ b/caffe2/core/nomnigraph/tests/binary_match_test.cc @@ -19,7 +19,7 @@ TEST(BinaryMatch, AllMatch) { auto matches = nom::algorithm::binaryMatch( &graph, [](decltype(graph)::NodeRef n) { return true; }); EXPECT_EQ(matches.size(), 1); - EXPECT_EQ(matches.front().getNodesCount(), graph.getMutableNodes().size()); + EXPECT_EQ(matches.front().Nodes.size(), graph.getMutableNodes().size()); } TEST(BinaryMatch, EmptyGraph) { @@ -58,9 +58,9 @@ TEST(BinaryMatch, Basic) { EXPECT_EQ(matches.size(), 1); auto match = matches.front(); - EXPECT_EQ(match.getNodesCount(), 4); + EXPECT_EQ(match.Nodes.size(), 4); std::set exp{"2", "3", "4", "6"}; - for (auto n : match.getNodes()) { + for (auto n : match.Nodes) { EXPECT_EQ(exp.count(n->data()), 1); exp.erase(n->data()); } @@ -104,16 +104,16 @@ TEST(BinaryMatch, RemovedMiddleNode) { auto match1 = matches.front(); auto match2 = matches.back(); - EXPECT_EQ(match1.getNodesCount(), 2); - EXPECT_EQ(match2.getNodesCount(), 1); + EXPECT_EQ(match1.Nodes.size(), 2); + EXPECT_EQ(match2.Nodes.size(), 1); std::set exp1{"2", "4"}; std::set exp2{"6"}; - for (auto n : match1.getNodes()) { + for (auto n : match1.Nodes) { EXPECT_EQ(exp1.count(n->data()), 1); exp1.erase(n->data()); } - for (auto n : match2.getNodes()) { + for (auto n : match2.Nodes) { EXPECT_EQ(exp2.count(n->data()), 1); exp2.erase(n->data()); } diff --git a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc deleted file mode 100644 index ddd8a15fcdc2bc..00000000000000 --- a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc +++ /dev/null @@ -1,404 +0,0 @@ -#include - -#include "test_util.h" - -#include "nomnigraph/Transformations/SubgraphMatcher.h" - -#include - -namespace nom { - -namespace matcher { - -using NodeType = std::string; -using Criteria = std::string; - -// Node matches a criteria (string) if the data string is the same as the -// criteria. Special case: "*" will match any thing. -struct TestNodeMatch { - static bool isMatch( - const nom::Graph::NodeRef& node, - const Criteria& criteria) { - return criteria == "*" || criteria == node->data(); - } -}; - -using TestGraph = Graph; -using TestMatcher = SubgraphMatcher; - -Criteria any() { - return Criteria("*"); -} - -// Make it more concise to create matching criteria in dataflow graph. -// For example, operatorTree("opA", ...) will refer to a tree like this: -// ... -> opA -> opA_Output -SubtreeMatchCriteria operatorTree( - const Criteria& root, - const std::vector>& childrenCriteria = {}, - int count = 1) { - return tree(any(), {tree(root, childrenCriteria)}, count); -} - -std::map TestGraphNodePrinter( - TestGraph::NodeRef node) { - std::map labelMap; - labelMap["label"] = node->data(); - return labelMap; -}; - -// Attempts to create a realistic dataflow graph that shows a fuse procedure. 
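The criteria convention exercised throughout this (removed) test is compact: a node matches a criteria string when the strings are equal, and "*" is a wildcard that matches anything. A minimal Python sketch of that rule, offered only as an illustration of the convention, not the C++ SubgraphMatcher API:

    def is_node_match(node_data, criteria):
        # "*" is a wildcard; any other criteria must equal the node's data string.
        return criteria == "*" or criteria == node_data

    assert is_node_match("opB", "*")
    assert is_node_match("opB", "opB")
    assert not is_node_match("opB", "opC")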
-struct DataFlowTestGraph { - const int numInputs = 4; - - TestGraph graph; - - TestGraph::NodeRef opB; - TestGraph::NodeRef opF; - TestGraph::NodeRef opC; - TestGraph::NodeRef opG; - TestGraph::NodeRef dataOut; - - // Realistic data flow test graph. - /* - - - +---------------+ - | | - | +---------+ | +---------+ - +---------------------+ | input_A | | | input_B | - | +---------+ | +---------+ - | | | | - | | | | - | v v v - +---------++---------+ +-------------------------+ +--------+ - | input_C || input_D | --> | opC | --> | dataC2 | - +---------++---------+ +-------------------------+ +--------+ - | - | - v - +---------+ - | dataC | -+ - +---------+ | - | | - | | - v | - +---------+ | - | opB | <+ - +---------+ - | - | - v - +---------+ - | dataB | - +---------+ - | - | - v - +---------+ - | opF | - +---------+ - | - | - v - +---------+ - | dataF | - +---------+ - | - | - v - +---------+ +---------+ - | dataI | --> | opG | - +---------+ +---------+ - | - | - v - +---------+ - | dataOut | - +---------+ - */ - DataFlowTestGraph() { - opC = graph.createNode("opC"); - - for (int i = 0; i < numInputs; i++) { - auto dataInput = graph.createNode("input"); - graph.createEdge(dataInput, opC); - } - - auto dataC = graph.createNode("dataC"); - auto dataC2 = graph.createNode("dataC2"); - graph.createEdge(opC, dataC); - graph.createEdge(opC, dataC2); - - opB = graph.createNode("opB"); - // There are 2 edges - graph.createEdge(dataC, opB); - graph.createEdge(dataC, opB); - - auto dataB = graph.createNode("dataB"); - graph.createEdge(opB, dataB); - - opF = graph.createNode("opF"); - graph.createEdge(dataB, opF); - - auto dataF = graph.createNode("dataF"); - graph.createEdge(opF, dataF); - - auto dataI = graph.createNode("dataI"); - - opG = graph.createNode("opG"); - graph.createEdge(dataF, opG); - graph.createEdge(dataI, opG); - - dataOut = graph.createNode("dataOut"); - graph.createEdge(opG, dataOut); - - // Use nom::converters::convertToDotString(&graph, TestGraphNodePrinter) - // to visualize the graph. - } -}; - -SubtreeMatchCriteria DataFlowTestGraphCriteria() { - // clang-format off - return tree( - Criteria("opG"),{ - operatorTree("opF", { - // Note: we currently don't enforce that these 2 opC nodes - // have to be the same. - operatorTree("opB", { - operatorTree("opC", { - treeStar(Criteria("input")) - }, 2), - }) - }), - tree(any()) // matches dataI - }); - // clang-format on -} - -TestGraph::NodeRef getInNode(TestGraph::NodeRef node, int index) { - return node->getInEdges()[index]->tail(); -} - -} // namespace matcher - -} // namespace nom - -using namespace nom::matcher; - -// Simple test cases for node matching criteria. -TEST(SubgraphMatcher, IsNodeMatch) { - TestGraph graph; - auto n1 = graph.createNode("Hello"); - auto n2 = graph.createNode("Le"); - graph.createEdge(n1, n2); - - EXPECT_TRUE(TestMatcher::isNodeMatch(n1, "Hello")); - EXPECT_FALSE(TestMatcher::isNodeMatch(n1, "G")); - EXPECT_TRUE(TestMatcher::isNodeMatch(n2, "Le")); - EXPECT_FALSE(TestMatcher::isNodeMatch(n2, "le")); -} - -// Test subtree matching with a simple tree graph. 
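The ASCII diagram above is hard to read in patch form; the same topology, rebuilt as a plain adjacency list in Python for orientation only (node names follow the test, with inputs numbered here for uniqueness):

    # inputs -> opC -> {dataC, dataC2}; dataC -> opB (two parallel edges) ->
    # dataB -> opF -> dataF; {dataF, dataI} -> opG -> dataOut
    edges = []
    for i in range(4):                                   # numInputs == 4 in the test
        edges.append(("input_%d" % i, "opC"))
    edges += [("opC", "dataC"), ("opC", "dataC2")]
    edges += [("dataC", "opB"), ("dataC", "opB")]        # the two parallel edges
    edges += [("opB", "dataB"), ("dataB", "opF"), ("opF", "dataF")]
    edges += [("dataF", "opG"), ("dataI", "opG"), ("opG", "dataOut")]

    consumers = {}
    for tail, head in edges:
        consumers.setdefault(tail, []).append(head)
    assert consumers["dataC"] == ["opB", "opB"]          # dataC feeds opB twice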
-TEST(SubgraphMatcher, IsSubtreeMatch) { - TestGraph graph; - auto n1 = graph.createNode("1"); - auto n2 = graph.createNode("2"); - auto n3 = graph.createNode("3"); - auto n4 = graph.createNode("4"); - auto n5 = graph.createNode("5"); - auto n6 = graph.createNode("6"); - auto n7 = graph.createNode("7"); - - graph.createEdge(n1, n2); - graph.createEdge(n2, n3); - graph.createEdge(n2, n4); - graph.createEdge(n1, n5); - graph.createEdge(n5, n6); - graph.createEdge(n5, n7); - /* N1 - / \ - N2 N5 - / \ / \ - N3 N4 N6 N7 - */ - - auto subtree = tree(any(), {tree(any()), tree(any())}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n4, subtree, false)); - - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); - - subtree = tree(Criteria("5"), {tree(any()), tree(any())}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); - - subtree = tree(any(), {tree(any()), tree(Criteria("4"))}); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n5, subtree, false)); -} - -// Test subtree matching in which * (repeated) matching of children is allowed. -TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { - TestGraph graph; - auto n1 = graph.createNode("1"); - auto n2 = graph.createNode("2"); - auto n3A = graph.createNode("3"); - auto n3B = graph.createNode("3"); - auto n4 = graph.createNode("4"); - auto n5A = graph.createNode("5"); - auto n5B = graph.createNode("5"); - auto n5C = graph.createNode("5"); - graph.createEdge(n1, n2); - graph.createEdge(n1, n3A); - graph.createEdge(n1, n3B); - graph.createEdge(n1, n4); - graph.createEdge(n1, n4); - graph.createEdge(n1, n5A); - graph.createEdge(n1, n5B); - graph.createEdge(n1, n5C); - - auto subtree = tree(any(), {tree(Criteria("2"))}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), {treeStar(Criteria("2"))}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - // clang-format off - subtree = tree(any(), { - tree(Criteria("2")), - tree(Criteria("3"), {}, 2), - tree(Criteria("4"), {}, 2), - tree(Criteria("5"), {}, 3) - }); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - tree(Criteria("3"), {}, 2), - tree(Criteria("4"), {}, 2), - treeStar(Criteria("5")) - }); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - treeStar(Criteria("3")), - tree(Criteria("4"), {}, 2), - treeStar(Criteria("5")) - }); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - treeStar(Criteria("3")), - }); - // Fails because there are unmatched edges. - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - tree(Criteria("3"), {}, 2), - tree(Criteria("4")), - tree(Criteria("5"), {}, 3) - }); - // Fails because the count is wrong; we have 2 edges to node N4 while - // the pattern expects only 1. 
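The counted-children semantics that the failing case above exercises can be restated outside C++; a rough Python sketch of the counting rule only, not the SubgraphMatcher implementation:

    from collections import Counter

    def children_match(child_labels, exact_counts, star_labels=()):
        counts = Counter(child_labels)
        for label, expected in exact_counts.items():
            if counts.pop(label, 0) != expected:
                return False
        # Anything left over must be absorbed by a "star" (repeated) criteria.
        return all(label in star_labels for label in counts)

    children = ["2", "3", "3", "4", "4", "5", "5", "5"]   # N1's children above
    # Passes when "4" is expected twice and "5" is starred.
    assert children_match(children, {"2": 1, "3": 2, "4": 2}, star_labels=("5",))
    # Fails when only one edge to "4" is expected, mirroring the comment above.
    assert not children_match(children, {"2": 1, "3": 2, "4": 1, "5": 3})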
- EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - // clang-format on -} - -TEST(SubgraphMatcher, IsSubtreeMatchRealistic) { - auto graph = DataFlowTestGraph(); - auto subtree = DataFlowTestGraphCriteria(); - - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opF, subtree)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opC, subtree)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opB, subtree)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.dataOut, subtree)); - - EXPECT_TRUE(TestMatcher::isSubtreeMatch(graph.opG, subtree)); -} - -TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { - auto graph = DataFlowTestGraph(); - auto subtree = DataFlowTestGraphCriteria(); - - TestMatcher::replaceSubtree( - graph.graph, subtree, [](TestGraph& g, TestGraph::NodeRef opG) { - auto opFused = g.createNode("opFused"); - - auto dataF = getInNode(opG, 0); - auto opF = getInNode(dataF, 0); - auto dataB = getInNode(opF, 0); - auto opB = getInNode(dataB, 0); - auto dataC = getInNode(opB, 0); - auto opC = getInNode(dataC, 0); - - g.deleteNode(dataF); - g.replaceNode(opG, opFused); - - auto outEdgesC = opC->getOutEdges(); - g.deleteNode(outEdgesC[0]->head()); - g.deleteNode(outEdgesC[1]->head()); - g.replaceNode(opC, opFused); - - g.deleteNode(opC); - g.deleteNode(opB); - g.deleteNode(dataB); - g.deleteNode(opF); - g.deleteNode(opG); - - return true; - }); - - // Now the nodes are: - // - NumInputs input nodes - // - dataI node - // - fused node - // - output node - auto nodes = graph.graph.getMutableNodes(); - - // Test that the graph is transformed as expected. - EXPECT_EQ(nodes.size(), graph.numInputs + 3); - TestGraph::NodeRef opFused; - TestGraph::NodeRef dataI; - TestGraph::NodeRef dataOut; - for (auto node : nodes) { - if (node->data() == "opFused") { - opFused = node; - } else if (node->data() == "dataOut") { - dataOut = node; - } else if (node->data() == "dataI") { - dataI = node; - } - } - - EXPECT_EQ(getInNode(dataOut, 0), opFused); - EXPECT_EQ(getInNode(opFused, 0), dataI); - for (int i = 1; i <= graph.numInputs; i++) { - EXPECT_EQ(getInNode(opFused, i)->data(), "input"); - } - - // Use nom::converters::convertToDotString(&graph.graph, TestGraphNodePrinter) - // to visualize. The transformed graph looks like This - /* - - +---------++---------+ - | input_A || input_D | - +---------++---------+ - | | - | | - v v -+---------+ +--------------------+ +---------+ -| input_B | --> | opFused | <-- | input_C | -+---------+ +--------------------+ +---------+ - | ^ - | | - v | - +---------++---------+ - | dataOut || dataI | - +---------++---------+ - */ -} diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9f88f192936fe4..734d38d75e680d 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -323,10 +323,6 @@ class OperatorBase : public Observable { return !event_; } - virtual void SyncDevice() { - CAFFE_NOT_IMPLEMENTED; - } - // Checks whether stream is ready to execute new computation, // used in stream allocation optimization to skip stream that is currently // busy. 
Depends on context and operator's device, returns true by default @@ -581,8 +577,6 @@ class Operator : public OperatorBase { return &context_; } - void SyncDevice() final {} - virtual std::vector> InputFillers( const std::vector>& shapes) { CAFFE_ENFORCE(shapes.size() == Inputs().size()); diff --git a/caffe2/core/operator_gpu.cc b/caffe2/core/operator_gpu.cc deleted file mode 100644 index 03f227f7453524..00000000000000 --- a/caffe2/core/operator_gpu.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/operator.h" - -namespace caffe2 { - -template <> -void Operator::SyncDevice() { - auto* context = getContext(); - int device; - cudaGetDevice(&device); - - cudaEvent_t ev; - cudaSetDevice(context->cuda_gpu_id()); - cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); - cudaEventRecord(ev, context->cuda_stream()); - cudaEventSynchronize(ev); - cudaEventDestroy(ev); - cudaSetDevice(device); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) { - CAFFE_THROW("Encountered CUDA error Stop: ", cudaGetErrorString(error)); - } -} - -} // namespace caffe2 diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index facea9fa64d2fa..b4a01b57cc11e3 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -14,9 +14,8 @@ #include -#include "ATen/core/Half.h" #include "caffe2/core/common.h" -#include "ATen/core/IdWrapper.h" +#include "caffe2/utils/IdWrapper.h" namespace caffe2 { class CaffeTypeId; @@ -33,16 +32,16 @@ class TypeMeta; * You need to register your types using CAFFE_KNOWN_TYPE(MyType) to be able to use CaffeTypeId with custom types. * This is for example used to store the dtype of tensors. */ -class CaffeTypeId final : public at::IdWrapper { +class CaffeTypeId final : public c10::guts::IdWrapper { public: static CaffeTypeId createTypeId(); friend std::ostream& ::operator<<(std::ostream& stream, CaffeTypeId typeId); friend bool operator<(CaffeTypeId lhs, CaffeTypeId rhs); - // This is 8, because 0 is uint8_t (due to ScalarType BC constraint) + // TODO Can we get rid of uninitialized? static constexpr CaffeTypeId uninitialized() { - return CaffeTypeId(8); + return CaffeTypeId(0); } private: @@ -58,7 +57,7 @@ inline bool operator<(CaffeTypeId lhs, CaffeTypeId rhs) { } -AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) +C10_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) inline std::ostream& operator<<(std::ostream& stream, caffe2::CaffeTypeId typeId) { return stream << typeId.underlyingId(); @@ -440,41 +439,35 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { class Tensor; -// Note: we have preallocated the numbers 0-8 so they line up exactly -// with at::ScalarType's numbering. All other numbers do not matter. -// -// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. - +// note: first preallocated id is 1, because 0 is used for uninitialized type +// ids. 
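The note above captures the convention this version of typeid.h adopts: id 0 is reserved for the uninitialized state, so preallocated ids start at 1. A toy Python registry illustrating that reservation, as an analogy only and not the CaffeTypeId implementation:

    import itertools

    class TypeRegistry:
        UNINITIALIZED = 0                       # never handed out

        def __init__(self):
            self._next = itertools.count(1)     # preallocated ids start at 1
            self._ids = {}

        def register(self, name):
            if name not in self._ids:
                self._ids[name] = next(self._next)
            return self._ids[name]

    reg = TypeRegistry()
    assert reg.register("Tensor") == 1          # matches the first declaration below
    assert reg.register("float") == 2
    assert reg.register("Tensor") == 1          # stable across re-registration
    assert TypeRegistry.UNINITIALIZED not in reg._ids.values()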
struct _CaffeHighestPreallocatedTypeId final {}; -CAFFE_DECLARE_KNOWN_TYPE(0, uint8_t); -CAFFE_DECLARE_KNOWN_TYPE(1, int8_t); -CAFFE_DECLARE_KNOWN_TYPE(2, int16_t); +CAFFE_DECLARE_KNOWN_TYPE(1, Tensor); +CAFFE_DECLARE_KNOWN_TYPE(2, float); CAFFE_DECLARE_KNOWN_TYPE(3, int); -CAFFE_DECLARE_KNOWN_TYPE(4, int64_t); -CAFFE_DECLARE_KNOWN_TYPE(5, at::Half); -CAFFE_DECLARE_KNOWN_TYPE(6, float); -CAFFE_DECLARE_KNOWN_TYPE(7, double); -// 8 = undefined type id - -CAFFE_DECLARE_KNOWN_TYPE(9, Tensor); -CAFFE_DECLARE_KNOWN_TYPE(10, std::string); -CAFFE_DECLARE_KNOWN_TYPE(11, bool); -CAFFE_DECLARE_KNOWN_TYPE(12, uint16_t); -CAFFE_DECLARE_KNOWN_TYPE(13, char); -CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr); -CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr>); -CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(18, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(19, bool*); -CAFFE_DECLARE_KNOWN_TYPE(20, char*); -CAFFE_DECLARE_KNOWN_TYPE(21, int*); +CAFFE_DECLARE_KNOWN_TYPE(4, std::string); +CAFFE_DECLARE_KNOWN_TYPE(5, bool); +CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t); +CAFFE_DECLARE_KNOWN_TYPE(7, int8_t); +CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t); +CAFFE_DECLARE_KNOWN_TYPE(9, int16_t); +CAFFE_DECLARE_KNOWN_TYPE(10, int64_t); +CAFFE_DECLARE_KNOWN_TYPE(11, double); +CAFFE_DECLARE_KNOWN_TYPE(12, char); +CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr); +CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr>); +CAFFE_DECLARE_KNOWN_TYPE(15, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(18, bool*); +CAFFE_DECLARE_KNOWN_TYPE(19, char*); +CAFFE_DECLARE_KNOWN_TYPE(20, int*); #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(22, long); -CAFFE_DECLARE_KNOWN_TYPE(23, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(21, long); +CAFFE_DECLARE_KNOWN_TYPE(22, std::vector); #endif // CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId); +CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId); } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index c7215e0ed28b32..35c2008d4fdab0 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -21,7 +21,7 @@ class IDEEPContext final : public BaseContext { CAFFE_ENFORCE_EQ(option.device_type(), IDEEP); } - ~IDEEPContext() noexcept override {} + ~IDEEPContext() noexcept {} BaseStaticContext* GetStaticContext() const override { return GetIDEEPStaticContext(); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 755e1b5a57b8a9..45f55ab2407a2e 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. 
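The padding computed on the next line exists because the kernel reads the noise buffer as half4, so its length must be a multiple of four. The arithmetic, restated in Python as an illustration of the rounding only:

    def div_round_up(value, divisor):
        return (value + divisor - 1) // divisor

    noise_size = 491                            # the prime default used above
    padded = div_round_up(noise_size, 4) * 4
    assert padded == 492 and padded % 4 == 0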
noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType(CPU) || + if (!noiseBlob->IsType() || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index bcf588d8a384f0..9f032e6fe299d0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("stddev")->GetMutable(); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("pw")->GetMutable(); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(name)->GetMutable(); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -683,7 +683,7 @@ void testMPSCNN() { Workspace ws; { auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,7 +1072,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = + ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ 
-1080,7 +1081,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1092,7 +1093,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1188,7 +1189,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1196,7 +1197,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1204,7 +1205,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1275,7 +1276,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1283,7 +1284,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1291,7 +1292,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1385,7 +1386,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1393,7 +1394,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1401,7 +1402,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1493,7 +1494,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1501,7 +1502,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1509,7 +1510,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1607,7 +1608,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1615,7 +1616,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = 
ws.CreateBlob("W")->GetMutable(); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1623,7 +1624,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1726,7 +1727,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1734,7 +1735,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1791,7 +1792,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1799,7 +1800,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1856,7 +1857,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1864,7 +1865,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1921,7 +1922,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1929,7 +1930,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2011,7 +2012,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2065,7 +2066,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2136,7 +2137,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2144,7 +2145,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("R")->GetMutable(); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2250,14 +2251,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( 
t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("R")->GetMutable(); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2362,7 +2363,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2497,7 +2498,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2505,7 +2506,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutable(); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2513,7 +2514,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("im_info")->GetMutable(); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2521,7 +2522,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("anchors")->GetMutable(); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2587,7 +2588,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2661,8 +2662,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu") + ->GetMutable(); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2675,7 +2676,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutableTensor(CPU); + ws.CreateBlob("W")->GetMutable(); t->Resize( inputChannels, outputChannels, @@ -2692,7 +2693,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutableTensor(CPU); + ws.CreateBlob("b")->GetMutable(); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2809,7 +2810,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2891,7 +2892,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2964,7 +2965,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3337,7 +3338,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { cws.RunNetOnce(initNet); { auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + cws.CreateBlob(predictNet.external_input(0))->GetMutable(); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3349,7 +3350,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { mws.RunNetOnce(initNet); { auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + mws.CreateBlob(predictNet.external_input(0))->GetMutable(); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3397,16 +3398,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutable(); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index 2896bc26ac08d4..70b9ac05747511 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -41,7 +41,7 @@ void RowsWhereRoILevelEquals(Eigen::Ref rois, // distribute those proposals to their appropriate FPN levels for Faster RCNN. // An anchor at one FPN level may predict an RoI that will map to another // level, hence the need to redistribute the proposals. -// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py +// Reference: detectron/lib/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index 2f11645f21c5cc..ddb0f8f89c144b 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -602,12 +602,12 @@ bool CudnnConvOp::DoRunWithType() { kernel_w())); } else { vector dims(filter.dims().begin(), filter.dims().end()); + dims[0] /= group_; #if !CUDNN_VERSION_MIN(7, 0, 0) - // We only need to divide dims by group_ when CUDNN version < 7.0 - // see CUDA group convolution doc: https://fburl.com/dgj6dvpd order_ == StorageOrder::NCHW ? 
dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; #endif + dims[filter.ndim() - 1] /= group_; CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, @@ -959,12 +959,10 @@ bool CudnnConvGradientOp::DoRunWithType() { } else { vector dims(filter.dims().begin(), filter.dims().end()); #if !CUDNN_VERSION_MIN(7, 0, 0) - // We only need to divide dims by group_ when CUDNN version < 7.0 - // see CUDA group convolution doc: https://fburl.com/dgj6dvpd + dims[0] /= group_; +#endif order_ == StorageOrder::NCHW ? dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; -#endif - CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index faf4936495244f..81f7d9ac43123f 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -59,7 +59,7 @@ ERMatXf ComputeAllAnchors( // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the // final bounding boxes. -// Reference: facebookresearch/Detectron/detectron/ops/generate_proposals.py +// Reference: detectron/lib/ops/generate_proposals.py template class GenerateProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 333514102b7d4b..0c4c345d382cb1 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -5,7 +5,7 @@ #include "caffe2/utils/math.h" // Bounding box utils for generate_proposals_op -// Reference: facebookresearch/Detectron/detectron/utils/boxes.py +// Reference: detectron/lib/utils/boxes.py namespace caffe2 { namespace utils { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 7b38cd6a1420d6..5d6f87d4d30563 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -19,7 +19,7 @@ namespace utils { // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. -// Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx +// Reference: detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // scores: scores for each bounding box, size: (M, 1) @@ -78,7 +78,7 @@ std::vector nms_cpu_upright( /** * Soft-NMS implementation as outlined in https://arxiv.org/abs/1704.04503. - * Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx + * Reference: detectron/lib/utils/cython_nms.pyx * out_scores: Output updated scores after applying Soft-NMS * proposals: pixel coordinates of proposed bounding boxes, * size: (M, 4), format: [x1; y1; x2; y2] @@ -426,7 +426,7 @@ std::vector nms_cpu( // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. 
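For readers who do not want to chase the Detectron reference, the rejection rule described above is ordinary greedy NMS. A compact Python sketch of the idea, not the cython_nms implementation and not the rotated-box (RRPN) variant handled further down:

    import numpy as np

    def box_area(b):
        return (b[2] - b[0] + 1.0) * (b[3] - b[1] + 1.0)

    def iou(a, b):
        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(ix2 - ix1 + 1.0, 0.0) * max(iy2 - iy1 + 1.0, 0.0)
        return inter / (box_area(a) + box_area(b) - inter)

    def greedy_nms(boxes, scores, thresh):
        order = list(np.argsort(scores)[::-1])   # highest score first
        keep = []
        while order:
            i = order.pop(0)
            keep.append(i)
            # Reject boxes whose IoU with the kept, higher-scoring box exceeds thresh.
            order = [j for j in order if iou(boxes[i], boxes[j]) <= thresh]
        return keep

    boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], dtype=float)
    scores = np.array([0.9, 0.8, 0.7])
    assert greedy_nms(boxes, scores, thresh=0.5) == [0, 2]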
-// Reference: facebookresearch/Detectron/detectron/lib/utils/cython_nms.pyx +// Reference: detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // size: (M, 5), format: [ctr_x; ctr_y; w; h; angle (degrees)] for RRPN diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 37d675eba83a49..b4866618b4e607 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -146,6 +146,9 @@ REGISTER_CONVERTER(SpatialBN, BatchNormalizationConverter); TRIVIAL_CONVERTER(Flatten); REGISTER_CONVERTER(Flatten, FlattenConverter); +TRIVIAL_CONVERTER(BatchGather); +REGISTER_CONVERTER(BatchGather, BatchGatherConverter); + class AveragePoolConverter : public Converter { std::unique_ptr convertToNeuralNetOperator( const OperatorDef& op) override { @@ -202,6 +205,37 @@ class ConcatConverter : public Converter { }; REGISTER_CONVERTER(Concat, ConcatConverter); +class BatchMatMulConverter : public Converter { + std::unique_ptr convertToNeuralNetOperator( + const OperatorDef& op) override { + std::unique_ptr nnOp = + util::make_unique(); + auto argMap = getArgumentsFromOperator(op); + + auto c = dyn_cast(nnOp.get()); + if (argMap.count("trans_a")) { + CAFFE_ENFORCE(argMap["trans_a"].has_i(), "Invalid axis argument"); + int trans_a = static_cast(argMap["trans_a"].i()); + c->setTransA(!!trans_a); + } + if (argMap.count("trans_b")) { + CAFFE_ENFORCE(argMap["trans_b"].has_i(), "Invalid add_axis argument"); + int trans_b = static_cast(argMap["trans_b"].i()); + c->setTransB(!!trans_b); + } + if (argMap.count("broadcast")) { + CAFFE_ENFORCE(argMap["broadcast"].has_i(), "Invalid add_axis argument"); + int broadcast = static_cast(argMap["broadcast"].i()); + c->setBroadcast(!!broadcast); + } + return nnOp; + } + // Does not override default converter to OperatorDef + + virtual ~BatchMatMulConverter() {} +}; +REGISTER_CONVERTER(BatchMatMul, BatchMatMulConverter); + } // namespace std::unique_ptr convertToNeuralNetOperator( @@ -236,6 +270,145 @@ std::unique_ptr convertToNeuralNetOperator( return nnOp; } +void handleWhileOp( + repr::NNGraph& dfg, + repr::NNCFGraph& cfg, + repr::NNGraph::NodeRef& opNode, + repr::NNCFGraph::NodeRef& bbNode, + OperatorDef& op, + std::unordered_map& blobMap +) { + opNode->resetData(util::make_unique()); + auto argMap = Converter::getArgumentsFromOperator(op); + std::string bodyNetSerialized = argMap["body"].s(); + auto bodyNet = caffe2::NetDef(); + bodyNet.ParseFromString(bodyNetSerialized); + + std::unordered_map bodyBlobMap; + auto bodyNN = convertToNNModule(bodyNet, &bodyBlobMap); + repr::NNGraph bodyGraph = std::move(bodyNN.dataFlow); + repr::NNCFGraph bodyCFGraph = std::move(bodyNN.controlFlow); + + auto rev_sorted = algorithm::tarjans(&bodyGraph); + + for (auto& k : bodyBlobMap) { + auto name = k.first; + if (blobMap.count(name)) { + auto oldNode = blobMap[name]; + printf("Exit tensor %s is in the parent scope, inserting Phi node...\n", k.first.c_str()); + auto phiNode = dfg.createNode(util::make_unique()); // NN variant of a Phi node + // Clone the operator. 
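The phi insertion being performed here follows the usual SSA pattern: when the loop body redefines a blob that already exists in the parent scope, downstream readers must see a merge of the two definitions. A schematic Python version of that merge, a sketch of the idea rather than the nomnigraph API:

    def merge_definitions(parent_defs, body_defs):
        # parent_defs / body_defs map blob name -> producing node (strings here).
        merged = dict(parent_defs)
        for name, body_def in body_defs.items():
            if name in parent_defs:
                # Both scopes define the blob: readers see a phi of the two.
                merged[name] = ("phi", parent_defs[name], body_def)
            else:
                merged[name] = body_def
        return merged

    merged = merge_definitions({"X": "X@parent"}, {"X": "X@loop_body"})
    assert merged["X"] == ("phi", "X@parent", "X@loop_body")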
+ auto tensor = dyn_cast(blobMap[name]->data().get()); + auto* clonedTensor = tensor->clone(); + auto phiOut = dfg.createNode(std::unique_ptr(clonedTensor)); + dfg.createEdge(phiNode, phiOut); + dfg.createEdge(oldNode, phiNode); + dfg.createEdge(bodyBlobMap[name], phiNode); + blobMap[name] = phiOut; + for (auto& inEdge : opNode->getInEdges()) { + if (inEdge->tail() == oldNode) { + dfg.deleteEdge(inEdge); + dfg.createEdge(phiOut, opNode); + } + } + } + } + + // Dependencies simply have no producers + std::unordered_map inNodeMap; + for (auto& n : bodyGraph.getMutableNodes()) { + if (!isa(n->data())) { continue; } + if (n->getInEdges().size() == 0) { + auto name = dyn_cast(n->data().get())->getName(); + // TODO(bwasti): this may be needed, depending on constraints + //assert(blobMap.count(name) != 0 && "Loop body takes undefined dependency."); + if (blobMap.count(name)) { + inNodeMap[n] = blobMap[name]; + } + } + } + + CAFFE_ENFORCE(rev_sorted.front().getNodes().size() == 1, + "More than one exit node."); + CAFFE_ENFORCE(rev_sorted.back().getNodes().size() == 1, + "More than one entry node."); + + auto exit_tensor = *(rev_sorted.front().getNodes().begin()); + CAFFE_ENFORCE(isa(exit_tensor->data()), + "Exit node is not a tensor."); + + auto bodyNodes = bodyGraph.getMutableNodes(); + auto bodyEdges = bodyGraph.getMutableEdges(); + + for (auto node : bodyNodes) { + bodyGraph.importNode(node, dfg); + } + + for (auto edge : bodyEdges) { + bodyGraph.importEdge(edge, dfg); + } + + // Merge all dependencies + for (auto node : dfg.getMutableNodes()) { + if (inNodeMap.count(node)) { + dfg.replaceNode(node, inNodeMap[node]); + dfg.deleteNode(node); + } + } + + for (const auto& inEdge : opNode->getInEdges()) { + auto* inputData = dyn_cast(inEdge->tail()->data().get()); + auto* exitData = dyn_cast(exit_tensor->data().get()); + if (inputData->getName() == exitData->getName()) { + dfg.replaceNode(exit_tensor, inEdge->tail()); + dfg.deleteNode(exit_tensor); + } + } + + // CFG Handling + auto bodyCFNodes = bodyCFGraph.getMutableNodes(); + auto bodyCFEdges = bodyCFGraph.getMutableEdges(); + + // Create a while loop CFG node. + auto whileBasicBlock = util::make_unique>(); + for (auto& inEdge : opNode->getInEdges()) { + auto node = inEdge->tail(); + for (auto& parentInEdge : node->getInEdges()) { + auto parentNode = parentInEdge->tail(); + if (isa(parentNode->data().get())) { + whileBasicBlock->pushInstructionNode(parentNode); + } + } + } + whileBasicBlock->pushInstructionNode(opNode); + + auto whileCFNode = cfg.createNode(std::move(whileBasicBlock)); + cfg.createEdge(bbNode, whileCFNode, 0); + + // The true path executes the body of the loop, so we + // take that BB and point to it. + for (auto cfNode : bodyCFNodes) { + bodyCFGraph.importNode(cfNode, cfg); + // If the CFG node has no children, we loop back to the top of the + // while loop. + if (cfNode->getOutEdges().size() == 0) { + cfg.createEdge(cfNode, whileCFNode, 0); + } + // TODO check for a single entry point + if (cfNode->getInEdges().size() == 0) { + cfg.createEdge(whileCFNode, cfNode, 1); + } + } + for (auto cfEdge : bodyCFEdges) { + bodyCFGraph.importEdge(cfEdge, cfg); + } + + // Now create the false case. + bbNode = + cfg.createNode(util::make_unique>()); + cfg.createEdge(whileCFNode, bbNode, -1); +} + /// \brief Ingest a caffe2 protobuf model and output an NNModule. 
/// \param net The caffe2 protobuf NetDef @@ -282,9 +455,13 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mapresetData(convertToNeuralNetOperator(op)); - auto currentBasicBlock = bbNode->mutableData()->get(); - currentBasicBlock->pushInstructionNode(opNode); + if (op.type() == "While") { + handleWhileOp(dfg, cfg, opNode, bbNode, op, blobMap); + } else { + opNode->resetData(convertToNeuralNetOperator(op)); + auto currentBasicBlock = bbNode->mutableData()->get(); + currentBasicBlock->pushInstructionNode(opNode); + } } repr::NNModule module; diff --git a/caffe2/opt/converter_nomigraph_test.cc b/caffe2/opt/converter_nomigraph_test.cc index 0bab53f738d7c2..69f51df49cbf74 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -48,3 +48,65 @@ TEST(Converter, UnknownType) { auto new_netdef = caffe2::convertToCaffe2Proto(nn); } +/* Temporarily disabled While conversion tests +TEST(Converter, While) { + caffe2::NetDef net; + + caffe2::OperatorDef *def = net.add_op(); + def->set_type("While"); + def->add_input("X"); + + caffe2::NetDef body_net; + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Relu"); + rdef->add_input("X"); + rdef->add_output("X"); + } + std::string body_net_serialized; + assert(body_net.SerializeToString(&body_net_serialized)); + ADD_ARG(def, "body", s, body_net_serialized); + + auto nn = caffe2::convertToNNModule(net); +} + +TEST(Converter, ComplexWhile) { + caffe2::NetDef net; + + { + caffe2::OperatorDef *rdef = net.add_op(); + rdef->set_type("Relu"); + rdef->add_input("X"); + rdef->add_output("X"); + } + + caffe2::OperatorDef *def = net.add_op(); + def->set_type("While"); + def->add_input("X"); + + caffe2::NetDef body_net; + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Instr1"); + rdef->add_input("X"); + rdef->add_output("X"); + } + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Instr2"); + rdef->add_input("X"); + rdef->add_output("X"); + } + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Instr3"); + rdef->add_input("X"); + rdef->add_output("X"); + } + std::string body_net_serialized; + assert(body_net.SerializeToString(&body_net_serialized)); + ADD_ARG(def, "body", s, body_net_serialized); + + auto nn = caffe2::convertToNNModule(net); +} +*/ diff --git a/caffe2/opt/device.cc b/caffe2/opt/device.cc index 0cfdd6c1dc91a3..9abca6d67e08b3 100644 --- a/caffe2/opt/device.cc +++ b/caffe2/opt/device.cc @@ -9,14 +9,15 @@ std::vector getInputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector inputTensorEdges; - for (const auto& node : sg.getNodes()) { + for (const auto& node : sg.Nodes) { NOM_REQUIRE_OR_CONT(nn::is(node)); NOM_REQUIRE_OR_CONT(nn::hasInputs(node)); // Check if tensor's parents are in the sg for (const auto& input : nn::getInputs(node)) { NOM_REQUIRE_OR_CONT( - !nn::hasProducer(input) || !sg.hasNode(nn::getProducer(input))); + !nn::hasProducer(input) || + sg.Nodes.count(nn::getProducer(input)) == 0); inputTensorEdges.emplace_back(g.getEdge(input, node)); } } @@ -27,13 +28,13 @@ std::vector getOutputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector outputTensorEdges; - for (const auto& node : sg.getNodes()) { + for (const auto& node : sg.Nodes) { NOM_REQUIRE_OR_CONT(nn::is(node)); for (const auto& output : nn::getOutputs(node)) { auto consumers = nn::getConsumers(output); for (const auto& consumer : consumers) { - NOM_REQUIRE_OR_CONT(!sg.hasNode(consumer)); + 
NOM_REQUIRE_OR_CONT(sg.Nodes.count(consumer) == 0); outputTensorEdges.emplace_back(g.getEdge(node, output)); } NOM_REQUIRE_OR_CONT(consumers.size() == 0); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index f5ea0f678ed515..8a1b736399562a 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -1,6 +1,5 @@ -#include "caffe2/opt/fusion.h" -#include "caffe2/core/logging.h" #include "caffe2/opt/converter.h" +#include "caffe2/opt/fusion.h" #include "caffe2/opt/passes.h" namespace caffe2 { @@ -19,25 +18,27 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { for (auto convNode : repr::nn::nodeIterator(nn->dataFlow)) { auto output = repr::nn::getOutputs(convNode).front(); auto consumers = repr::nn::getConsumers(output); - NOM_REQUIRE_OR_CONT(consumers.size() == 1); - + if (consumers.size() != 1) { + continue; + } auto consumer = consumers.front(); - NOM_REQUIRE_OR_CONT(repr::nn::is(consumer)); - + if (!repr::nn::is(consumer)) { + continue; + } auto bnNode = consumer; auto bn = repr::nn::get(bnNode); - auto bnOutputs = nn::getOutputs(bnNode); - NOM_REQUIRE_OR_CONT(bnOutputs.size() == 1); - auto bnOutput = bnOutputs.front(); auto convInputs = repr::nn::getInputs(convNode); - CAFFE_ENFORCE( - convInputs.size() >= 3, - "Invalid convolution input size (TODO: optional bias)"); + if (convInputs.size() < 3) { + assert(0 && "Invalid convolution input size (TODO: optional bias)"); + continue; + } auto bnInputs = repr::nn::getInputs(bnNode); - CAFFE_ENFORCE( - bnInputs.size() >= 5, "Invalid batch normalization input size"); + if (bnInputs.size() < 5) { + assert(0 && "Invalid batch normalization input size"); + continue; + } #define EXPOSE_TENSOR_DATA(name, index, inputs) \ auto name = repr::nn::get(inputs[index]); \ @@ -68,8 +69,6 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { biasConvData[c] = bias; } - nn->dataFlow.deleteNode(output); - nn->dataFlow.createEdge(convNode, bnOutput); nn->dataFlow.deleteNode(bnNode); return true; } diff --git a/caffe2/opt/mobile.cc b/caffe2/opt/mobile.cc index adbbbd19a1e367..6d0006818789bb 100644 --- a/caffe2/opt/mobile.cc +++ b/caffe2/opt/mobile.cc @@ -11,15 +11,23 @@ using namespace nom; void addNNPACK(repr::NNModule* nn, bool low_memory) { for (auto node : nn->dataFlow.getMutableNodes()) { + auto* nodeData = node->data().get(); // Let graph retain ownership. + // Skip blobs. - NOM_REQUIRE_OR_CONT(repr::nn::is(node)); + if (!isa(nodeData)) { + continue; + } // Check if it is a convolution. - auto nnOp = repr::nn::get(node); - NOM_REQUIRE_OR_CONT(isa(nnOp)); + auto nnOp = dyn_cast(nodeData); + if (!isa(nnOp)) { + continue; + } // Requires X, W, b for NNPACK - NOM_REQUIRE_OR_CONT(node->getInEdges().size() >= 3); + if (node->getInEdges().size() < 3) { + continue; + } std::string engine = "NNPACK"; @@ -27,7 +35,9 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { bool validTransformCandidate = true; auto conv = dyn_cast(nnOp); - NOM_REQUIRE_OR_CONT(conv->getLayout() == nom::repr::Conv::NNLayout::NCHW); + if (conv->getLayout() != nom::repr::Conv::NNLayout::NCHW) { + continue; + } // NNPACK only supports stride == 1 for (auto stride : conv->getStrides()) { @@ -36,21 +46,28 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { break; } } - NOM_REQUIRE_OR_CONT(validTransformCandidate); + if (!validTransformCandidate) { + continue; + } // NNPACK only supports 2DConv. 
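Collected in one place, the checks this rewrite performs before switching a convolution to the NNPACK engine amount to the predicate below (illustrative Python, not the nomnigraph code; the kernel-shape check appears just after this point in the diff):

    def nnpack_eligible(is_conv, num_inputs, layout, strides, kernel_shape):
        if not is_conv or num_inputs < 3:        # needs X, W, b
            return False
        if layout != "NCHW":
            return False
        if any(s != 1 for s in strides):         # NNPACK only supports stride 1
            return False
        if len(kernel_shape) != 2:               # 2D convolutions only
            return False
        kh, kw = kernel_shape
        if kh != kw and (kh == 1 or kw == 1):    # Kx1 / 1xK are inefficient, skip
            return False
        return True

    assert nnpack_eligible(True, 3, "NCHW", [1, 1], [3, 3])
    assert not nnpack_eligible(True, 3, "NCHW", [2, 2], [3, 3])
    assert not nnpack_eligible(True, 3, "NCHW", [1, 1], [1, 3])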
const auto& kernelShape = conv->getKernelShape(); - NOM_REQUIRE_OR_CONT(kernelShape.size() == 2); + if (kernelShape.size() != 2) { + continue; + } // Kx1 and 1xK convs are inefficient in NNPACK. if (kernelShape[0] != kernelShape[1]) { - NOM_REQUIRE_OR_CONT(kernelShape[0] != 1 && kernelShape[1] != 1); + if (kernelShape[0] == 1 || kernelShape[1] == 1) { + continue; + } } // We're good to use our engine. auto annotation = conv->getMutableAnnotation(); - NOM_REQUIRE_OR_CONT(annotation && isa(annotation)); - + if (!annotation || !isa(annotation)) { + continue; + } auto* op = dyn_cast(annotation)->getMutableOperatorDef(); op->set_engine(engine); if (!low_memory) { diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 09528b99b5da51..75baec0e9be66b 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -323,10 +323,8 @@ void OnnxifiTransformer::Transform( // function to tell whether the ONNXIFI backend supports a given C2 op or not // TODO: choose backend id - onnxifi_library* backend = lib_; - onnxBackendID backend_id = backend_ids_[0]; auto supports = - [&exporter, &shape_hints, backend, backend_id]( + [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( const caffe2::OperatorDef& op) { const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); // NB: this might not be a hard constraint as we can just export C2 diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 03264daf50f6a7..4c1e13d1008ac8 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -2,7 +2,6 @@ #ifdef CAFFE2_OPTIMIZER #include "caffe2/opt/optimizer.h" #endif -#include "caffe2/utils/proto_utils.h" #include #include "caffe2/core/init.h" @@ -97,9 +96,7 @@ Predictor::Predictor( GlobalInit(); #endif auto predict_net = config_.predict_net; - - if (optimization && - !ArgumentHelper::HasArgument(*predict_net, "disable_nomnigraph")) { + if (optimization) { #ifdef CAFFE2_OPTIMIZER try { *predict_net = opt::optimize(*predict_net, &ws_, optimization); diff --git a/caffe2/predictor/predictor.h b/caffe2/predictor/predictor.h index 458bf4401476c4..a3f05d7aacac89 100644 --- a/caffe2/predictor/predictor.h +++ b/caffe2/predictor/predictor.h @@ -28,7 +28,7 @@ class Predictor { const NetDef& run_net, Workspace* parent = nullptr, bool run_init = true, - int optimization = 1); + int optimization = 0); ~Predictor() {} diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index dd1734a587c1fc..d10bfe209f7b39 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -630,7 +630,7 @@ def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): beta=st.floats(min_value=0.1, max_value=0.9), lambda1=st.floats(min_value=0.001, max_value=0.1), lambda2=st.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), + engine=st.sampled_from([None]), **hu.gcs_cpu_only) def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, engine, gc, dc): diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index d2b6a4f6399fff..b1c0e1cd885ea4 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -5,12 +5,10 @@ from __future__ import print_function from __future__ import unicode_literals -from abc import ABCMeta, abstractmethod import argparse from future.utils import viewitems import logging import numpy as np -from six import with_metaclass 
import sys from caffe2.python import core, rnn_cell, workspace @@ -33,60 +31,7 @@ def _weighted_sum(model, values, weight, output_name): ) -class Seq2SeqModelCaffe2EnsembleDecoderBase(with_metaclass(ABCMeta, object)): - - @abstractmethod - def get_model_file(self, model): - pass - - @abstractmethod - def get_db_type(self): - pass - - def build_word_rewards(self, vocab_size, word_reward, unk_reward): - word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) - word_rewards[seq2seq_util.PAD_ID] = 0 - word_rewards[seq2seq_util.GO_ID] = 0 - word_rewards[seq2seq_util.EOS_ID] = 0 - word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward - return word_rewards - - def load_models(self): - db_reader = 'reader' - for model, scope_name in zip( - self.models, - self.decoder_scope_names, - ): - params_for_current_model = [ - param - for param in self.model.GetAllParams() - if str(param).startswith(scope_name) - ] - assert workspace.RunOperatorOnce(core.CreateOperator( - 'CreateDB', - [], [db_reader], - db=self.get_model_file(model), - db_type=self.get_db_type()) - ), 'Failed to create db {}'.format(self.get_model_file(model)) - assert workspace.RunOperatorOnce(core.CreateOperator( - 'Load', - [db_reader], - params_for_current_model, - load_all=1, - add_prefix=scope_name + '/', - strip_prefix='gpu_0/', - )) - logger.info('Model {} is loaded from a checkpoint {}'.format( - scope_name, self.get_model_file(model))) - - -class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase): - - def get_model_file(self, model): - return model['model_file'] - - def get_db_type(self): - return 'minidb' +class Seq2SeqModelCaffe2EnsembleDecoder(object): def scope(self, scope_name, blob_name): return ( @@ -313,6 +258,14 @@ def _build_decoder( attention_weights, ) + def build_word_rewards(self, vocab_size, word_reward, unk_reward): + word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) + word_rewards[seq2seq_util.PAD_ID] = 0 + word_rewards[seq2seq_util.GO_ID] = 0 + word_rewards[seq2seq_util.EOS_ID] = 0 + word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward + return word_rewards + def __init__( self, translate_params, @@ -461,6 +414,36 @@ def __init__( for param in self.model.params: logger.info(param) + def load_models(self): + db_reader = 'reader' + for model, scope_name in zip( + self.models, + self.decoder_scope_names, + ): + params_for_current_model = [ + param + for param in self.model.GetAllParams() + if str(param).startswith(scope_name) + ] + assert workspace.RunOperatorOnce(core.CreateOperator( + 'CreateDB', + [], [db_reader], + db=model['model_file'], + db_type='minidb') + ), 'Failed to create db {}'.format(model['model_file']) + assert workspace.RunOperatorOnce(core.CreateOperator( + 'Load', + [db_reader], + params_for_current_model, + load_all=1, + add_prefix=scope_name + '/', + strip_prefix='gpu_0/', + )) + logger.info('Model {} is loaded from a checkpoint {}'.format( + scope_name, + model['model_file'], + )) + def decode(self, numberized_input, max_output_seq_len): workspace.FeedBlob( self.encoder_inputs, diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 93e45704fcfea6..dab79b8b1fb0b4 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -35,7 +35,6 @@ import onnx.defs import onnx.optimizer import onnx.shape_inference -import onnx.utils from onnx.backend.base import Backend, Device, DeviceType, namedtupledict from caffe2.python.onnx.workspace import Workspace @@ -877,7 +876,6 @@ def 
_graph_to_net(cls, onnx_graph, opset_version): def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_initializers): device_option = get_device_option(Device(device)) - onnx_model = onnx.utils.polish_model(onnx_model) init_model = cls.optimize_onnx(onnx_model, init=True) pred_model = cls.optimize_onnx(onnx_model, predict=True) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index ee60d776d55a82..db870972f83946 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1421,8 +1421,7 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - if engine == "SIMD": - assert core.IsOperator('GFtrl_ENGINE_SIMD') + # SIMD version of GFTRL is not supported gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 3c42c2073163cd..07f88def015544 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -20,7 +20,6 @@ def add_tensor(net, name, blob): np.dtype('int32'): "GivenTensorIntFill", np.dtype('int64'): "GivenTensorInt64Fill", np.dtype('uint8'): "GivenTensorStringFill", - np.dtype('O'): "GivenTensorStringFill" } shape = blob.shape @@ -30,12 +29,6 @@ def add_tensor(net, name, blob): if blob.dtype == np.dtype('uint8'): shape = [1] values = [str(blob.data)] - # Only allow string arrays as objects. - # The only intended use case for this is to store arrays of strings in the - # model which can be used for post processing results in subsequent ops. - if blob.dtype == np.dtype('O'): - for blob_val in blob: - assert(isinstance(blob_val, bytes)) op = core.CreateOperator( kTypeNameMapper[blob.dtype], diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index 1c4cf77ea0512f..e7bbe2c90351c4 100644 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -73,15 +73,11 @@ def test_mobile_exporter_datatypes(self): model = ModelHelper(name="mobile_exporter_test_model") model.Copy("data_int", "out") model.params.append("data_int") - model.Copy("data_obj", "out_obj") - model.params.append("data_obj") # Create our mobile exportable networks workspace.RunNetOnce(model.param_init_net) np_data_int = np.random.randint(100, size=(1, 1, 28, 28), dtype=np.int32) workspace.FeedBlob("data_int", np_data_int) - np_data_obj = np.array(['aa', 'bb']).astype(np.dtype('O')) - workspace.FeedBlob("data_obj", np_data_obj) init_net, predict_net = mobile_exporter.Export( workspace, model.net, model.params @@ -90,7 +86,6 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(model.net) workspace.RunNet(model.net) ref_out = workspace.FetchBlob("out") - ref_out_obj = workspace.FetchBlob("out_obj") # Clear the workspace workspace.ResetWorkspace() @@ -102,11 +97,9 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(predict_net, True) workspace.RunNet(predict_net.name) manual_run_out = workspace.FetchBlob("out") - manual_run_out_obj = workspace.FetchBlob("out_obj") np.testing.assert_allclose( ref_out, manual_run_out, atol=1e-10, rtol=1e-10 ) - np.testing.assert_equal(ref_out_obj, manual_run_out_obj) # Clear the workspace workspace.ResetWorkspace() @@ -116,17 +109,11 @@ def test_mobile_exporter_datatypes(self): init_net.SerializeToString(), predict_net.SerializeToString() ) - # Output is a vector of outputs. 
+ # Output is a vector of outputs but we only care about the first and only result predictor_out = predictor.run([]) - assert len(predictor_out) == 2 - predictor_out_int = predictor_out[1] - predictor_out_obj = predictor_out[0] - # The order in predictor_out is non-deterministic. Use type of the entry - # to figure out what to compare it to. - if isinstance(predictor_out[1][0], bytes): - predictor_out_int = predictor_out[0] - predictor_out_obj = predictor_out[1] + assert len(predictor_out) == 1 + predictor_out = predictor_out[0] + np.testing.assert_allclose( - ref_out, predictor_out_int, atol=1e-10, rtol=1e-10 + ref_out, predictor_out, atol=1e-10, rtol=1e-10 ) - np.testing.assert_equal(ref_out_obj, predictor_out_obj) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 6e66cd75315716..2edc88ce0458d4 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -179,7 +179,6 @@ def test_transformer_SinkMaxPool(self): epsilon=st.floats(min_value=1e-5, max_value=1e-2), ) def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon): - workspace.ResetWorkspace() net = core.Net("net") c = input_channels h = size @@ -205,22 +204,16 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) - # This is necessary because 1/sqrt(var) is used and if var is too small - # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32)) workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2").flatten() - workspace.FeedBlob("Y2", np.zeros((1, 1))) + preTransformOutput = workspace.FetchBlob("Y2") transformer.FuseConvBN(net) # Ensure fusion assert len(net.Proto().op) == 1 workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2").flatten() + postTransformOutput = workspace.FetchBlob("Y2") # Check that there is no numerical difference assert np.allclose( - preTransformOutput, - postTransformOutput, - rtol=1e-02, - atol=1e-04 + preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08 ) diff --git a/caffe2/requirements.txt b/caffe2/requirements.txt index 07fd95b72582a2..9a1d67efc7c2f3 100644 --- a/caffe2/requirements.txt +++ b/caffe2/requirements.txt @@ -1,4 +1,2 @@ numpy enum34 -pyyaml -typing diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index ad9a80ed9203b5..921deb9b0b41aa 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -38,10 +38,10 @@ #pragma once -#include +#include #include #include -#include +#include "caffe2/utils/C++17.h" namespace c10 { namespace guts { @@ -101,32 +101,32 @@ class array final { // No explicit construct/copy/destroy for aggregate type. // DR 776. - AT_CPP14_CONSTEXPR void fill(const value_type& __u) + C10_CPP14_CONSTEXPR void fill(const value_type& __u) { std::fill_n(begin(), size(), __u); } - AT_CPP14_CONSTEXPR void swap(array& __other) + C10_CPP14_CONSTEXPR void swap(array& __other) { std::swap_ranges(begin(), end(), __other.begin()); } // Iterators. 
- AT_CPP14_CONSTEXPR iterator begin() noexcept + C10_CPP14_CONSTEXPR iterator begin() noexcept { return iterator(data()); } constexpr const_iterator begin() const noexcept { return const_iterator(data()); } - AT_CPP14_CONSTEXPR iterator end() noexcept + C10_CPP14_CONSTEXPR iterator end() noexcept { return iterator(data() + _Nm); } constexpr const_iterator end() const noexcept { return const_iterator(data() + _Nm); } - AT_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept + C10_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } constexpr const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - AT_CPP14_CONSTEXPR reverse_iterator rend() noexcept + C10_CPP14_CONSTEXPR reverse_iterator rend() noexcept { return reverse_iterator(begin()); } constexpr const_reverse_iterator rend() const noexcept @@ -152,13 +152,13 @@ class array final { constexpr bool empty() const noexcept { return size() == 0; } // Element access. - AT_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept + C10_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept { return _AT_Type::_S_ref(_M_elems, __n); } constexpr const_reference operator[](size_type __n) const noexcept { return _AT_Type::_S_ref(_M_elems, __n); } - AT_CPP14_CONSTEXPR reference at(size_type __n) { + C10_CPP14_CONSTEXPR reference at(size_type __n) { if (__n >= _Nm) { detail::__throw_out_of_range(std::string() + "array::at: __n (which is " + to_string(__n) + ") " + @@ -177,13 +177,13 @@ class array final { _AT_Type::_S_ref(_M_elems, 0)); } - AT_CPP14_CONSTEXPR reference front() noexcept + C10_CPP14_CONSTEXPR reference front() noexcept { return *begin(); } constexpr const_reference front() const noexcept { return _AT_Type::_S_ref(_M_elems, 0); } - AT_CPP14_CONSTEXPR reference back() noexcept + C10_CPP14_CONSTEXPR reference back() noexcept { return _Nm ? 
*(end() - 1) : *end(); } constexpr const_reference back() const noexcept @@ -192,7 +192,7 @@ class array final { : _AT_Type::_S_ref(_M_elems, 0); } - AT_CPP14_CONSTEXPR pointer data() noexcept + C10_CPP14_CONSTEXPR pointer data() noexcept { return _AT_Type::_S_ptr(_M_elems); } constexpr const_pointer data() const noexcept diff --git a/caffe2/utils/C++17.cpp b/caffe2/utils/C++17.cpp new file mode 100644 index 00000000000000..d75d9fc9dff490 --- /dev/null +++ b/caffe2/utils/C++17.cpp @@ -0,0 +1 @@ +#include "caffe2/utils/C++17.h" diff --git a/aten/src/ATen/core/C++17.h b/caffe2/utils/C++17.h similarity index 93% rename from aten/src/ATen/core/C++17.h rename to caffe2/utils/C++17.h index 5112d9070dcd5e..0186944e251159 100644 --- a/aten/src/ATen/core/C++17.h +++ b/caffe2/utils/C++17.h @@ -95,14 +95,10 @@ template using decay_t = typename std::decay::type; #ifdef __cpp_lib_logical_traits -template -using conjunction = std::conjunction; -template -using disjunction = std::disjunction; -template -using bool_constant = std::bool_constant; -template -using negation = std::negation; +using conjunction = std::conjunction; +using disjunction = std::disjunction; +using bool_constant = std::bool_constant; +using negation = std::negation; #else @@ -149,10 +145,7 @@ template using void_t = typename make_void::type; #ifdef __cpp_lib_apply -template -inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(std::forward(f), std::forward(t)); -} +using apply = std::apply; #else @@ -182,9 +175,9 @@ constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( #if defined(__cpp_constexpr) && __cpp_constexpr >= 201304 -# define AT_CPP14_CONSTEXPR constexpr +# define C10_CPP14_CONSTEXPR constexpr #else -# define AT_CPP14_CONSTEXPR +# define C10_CPP14_CONSTEXPR #endif diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 67897c36fe485a..5db06663bf6403 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -63,6 +63,8 @@ set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} set(LIB_SOURCES_CPU Array.cpp + C++17.cpp + IdWrapper.cpp Optional.cpp Metaprogramming.cpp TypeList.cpp diff --git a/caffe2/utils/IdWrapper.cpp b/caffe2/utils/IdWrapper.cpp new file mode 100644 index 00000000000000..7646a1392d4a6b --- /dev/null +++ b/caffe2/utils/IdWrapper.cpp @@ -0,0 +1 @@ +#include "caffe2/utils/IdWrapper.h" diff --git a/caffe2/utils/IdWrapper.h b/caffe2/utils/IdWrapper.h new file mode 100644 index 00000000000000..0c8e548ca017f6 --- /dev/null +++ b/caffe2/utils/IdWrapper.h @@ -0,0 +1,67 @@ +#pragma once + +#include + +namespace c10 { namespace guts { + +/** + * This template simplifies generation of simple classes that wrap an id + * in a typesafe way. Namely, you can use it to create a very lightweight + * type that only offers equality comparators and hashing. Example: + * + * struct MyIdType final : IdWrapper { + * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} + * }; + * + * Then in the global top level namespace: + * + * C10_DEFINE_IDWRAPPER(MyIdType); + * + * That's it - equality operators and hash functions are automatically defined + * for you, given the underlying type supports it. 
+ */ +template <class ConcreteType, class UnderlyingType> +class IdWrapper { +public: + using underlying_type = UnderlyingType; + using concrete_type = ConcreteType; + +protected: + constexpr explicit IdWrapper(underlying_type id) noexcept(noexcept(underlying_type(std::declval<underlying_type>()))) + : id_(id) {} + + constexpr underlying_type underlyingId() const noexcept(noexcept(underlying_type(std::declval<underlying_type>()))) { + return id_; + } + +private: + friend size_t hash_value(const concrete_type& v) { + return std::hash<underlying_type>()(v.id_); + } + + // TODO Making operator== noexcept if underlying type is noexcept equality comparable doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator==(const concrete_type& lhs, const concrete_type& rhs) { + return lhs.id_ == rhs.id_; + } + + // TODO Making operator!= noexcept if operator== is noexcept doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator!=(const concrete_type& lhs, const concrete_type& rhs) { + return !(lhs == rhs); + } + + underlying_type id_; +}; + +}} + +#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ + namespace std { \ + template <> \ + struct hash<ClassName> { \ + size_t operator()(ClassName x) const { \ + return hash_value(x); \ + } \ + }; \ + } diff --git a/caffe2/utils/TypeList.h b/caffe2/utils/TypeList.h index 7c20fa6613b966..3494843feae121 100644 --- a/caffe2/utils/TypeList.h +++ b/caffe2/utils/TypeList.h @@ -1,6 +1,6 @@ #pragma once -#include <ATen/core/C++17.h> +#include "caffe2/utils/C++17.h" #include "caffe2/utils/TypeTraits.h" namespace c10 { namespace guts { namespace typelist { diff --git a/caffe2/utils/TypeTraits.h b/caffe2/utils/TypeTraits.h index c60f8a00b1ebdd..004586987a81f7 100644 --- a/caffe2/utils/TypeTraits.h +++ b/caffe2/utils/TypeTraits.h @@ -1,6 +1,6 @@ #pragma once -#include <ATen/core/C++17.h> +#include "caffe2/utils/C++17.h" #include namespace c10 { diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index c573542af5763c..e0ae5cc0336e2a 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2605,13 +2605,6 @@ bool TransposeWithHPTT( axes_cm[i] = cm_fn(axes[cm_fn(i)]); dims_cm[i] = dims[cm_fn(i)]; } - - // HPTT doesn't handle 0 sized inputs.
- for (auto dim : dims_cm) { - if (dim <= 0) { - return false; - } - } auto plan = hptt::create_plan( axes_cm.data(), ndim, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 3829219a933b5d..bc30f35f2a2eee 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -1,9 +1,3 @@ -# This ill-named file does a number of things: -# - Installs Caffe2 header files (this has nothing to do with code generation) -# - Configures caffe2/core/macros.h -# - Creates an ATen target for its generated C++ files and adds it -# as a dependency - if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -20,11 +14,6 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") -if (NOT BUILD_ATEN) - install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core - DESTINATION include/ATen/core - FILES_MATCHING PATTERN "*.h") -endif() install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index 2f2628bb149866..2a4e61f97b0b18 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -83,26 +83,22 @@ endif() cmake_pop_check_state() # ---[ Check for NUMA support -if (USE_NUMA) - cmake_push_check_state(RESET) - set(CMAKE_REQUIRED_FLAGS "-std=c++11") - CHECK_CXX_SOURCE_COMPILES( +cmake_push_check_state(RESET) +set(CMAKE_REQUIRED_FLAGS "-std=c++11") +CHECK_CXX_SOURCE_COMPILES( "#include #include int main(int argc, char** argv) { }" CAFFE2_IS_NUMA_AVAILABLE) - if (CAFFE2_IS_NUMA_AVAILABLE) - message(STATUS "NUMA is available") - else() - message(STATUS "NUMA is not available") - set(CAFFE2_DISABLE_NUMA 1) - endif() - cmake_pop_check_state() + +if (CAFFE2_IS_NUMA_AVAILABLE) + message(STATUS "NUMA is available") else() - message(STATUS "NUMA is disabled") + message(STATUS "NUMA is not available") set(CAFFE2_DISABLE_NUMA 1) endif() +cmake_pop_check_state() # ---[ Check if we want to turn off deprecated warning due to glog. # Note(jiayq): on ubuntu 14.04, the default glog install uses ext/hash_set that @@ -161,15 +157,6 @@ if (${COMPILER_SUPPORTS_HIDDEN_INLINE_VISIBILITY}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CAFFE2_VISIBILITY_FLAG}") endif() -# ---[ Checks if linker supports -rdynamic. `-rdynamic` tells linker -# -to add all (including unused) symbols into the dynamic symbol -# -table. We need this to get symbols when generating backtrace at -# -runtime. -check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC) -if (${COMPILER_SUPPORTS_RDYNAMIC}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic") -endif() - # ---[ If we are using msvc, set no warning flags # Note(jiayq): if you are going to add a warning flag, check if this is # totally necessary, and only add when you see fit. If it is needed due to diff --git a/docs/libtorch.rst b/docs/libtorch.rst deleted file mode 100644 index 9ab59a4d749d66..00000000000000 --- a/docs/libtorch.rst +++ /dev/null @@ -1,19 +0,0 @@ -libtorch (C++-only) -=================== - -The core of pytorch can be built and used without Python. A -CMake-based build system compiles the C++ source code into a shared -object, libtorch.so. - -Building libtorch ------------------ - -There is a script which wraps the CMake build. 
Invoke it with - -:: - cd pytorch - BUILD_TORCH=ON ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 - ls torch/lib/tmp_install # output is produced here - ls torch/lib/tmp_install/lib/libtorch.so # of particular interest - -Future work will simplify this further. diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst index de541b467e819e..93224462e3177e 100644 --- a/docs/source/distributions.rst +++ b/docs/source/distributions.rst @@ -203,15 +203,6 @@ Probability distributions - torch.distributions :undoc-members: :show-inheritance: -:hidden:`NegativeBinomial` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: torch.distributions.negative_binomial -.. autoclass:: NegativeBinomial - :members: - :undoc-members: - :show-inheritance: - :hidden:`Normal` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 283409ea3676b8..987044bbd212f4 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -338,12 +338,6 @@ Non-linear activations (weighted sum, nonlinearity) .. autoclass:: SELU :members: -:hidden:`CELU` -~~~~~~~~~~~~~~ - -.. autoclass:: CELU - :members: - :hidden:`Sigmoid` ~~~~~~~~~~~~~~~~~ @@ -610,12 +604,6 @@ Loss functions .. autoclass:: CrossEntropyLoss :members: -:hidden:`CTCLoss` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: CTCLoss - :members: - :hidden:`NLLLoss` ~~~~~~~~~~~~~~~~~ @@ -996,11 +984,6 @@ Non-linear activation functions .. autofunction:: selu -:hidden:`celu` -~~~~~~~~~~~~~~ - -.. autofunction:: celu - :hidden:`leaky_relu` ~~~~~~~~~~~~~~~~~~~~ @@ -1197,11 +1180,6 @@ Loss functions .. autofunction:: cross_entropy -:hidden:`ctc_loss` -~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ctc_loss - :hidden:`hinge_embedding_loss` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/scripts/build_activation_images.py b/docs/source/scripts/build_activation_images.py index e973933e205692..ce424d1ff188fa 100644 --- a/docs/source/scripts/build_activation_images.py +++ b/docs/source/scripts/build_activation_images.py @@ -36,7 +36,6 @@ 'ReLU6', 'RReLU', 'SELU', - 'CELU', 'Sigmoid', 'Softplus', 'Softshrink', diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 06b0305d28aae8..c3c85797b4cd82 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -46,7 +46,7 @@ A tensor can be constructed from a Python :class:`list` or sequence using the If you have a numpy array and want to avoid a copy, use :func:`torch.as_tensor`. -A tensor of specific data type can be constructed by passing a +An tensor of specific data type can be constructed by passing a :class:`torch.dtype` and/or a :class:`torch.device` to a constructor or tensor creation op: diff --git a/docs/source/torch.rst b/docs/source/torch.rst index c68ec039d74ce3..c1e914c03c74e7 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -306,7 +306,3 @@ BLAS and LAPACK Operations .. autofunction:: svd .. autofunction:: symeig .. autofunction:: trtrs - -Utilities ----------------------------------- -.. 
autofunction:: compiled_with_cxx11_abi diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 62185d1e9dc821..1db0f546724103 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -296,10 +296,6 @@ fi # Add packages required for all Caffe2 builds add_package 'glog' add_package 'gflags' -add_package 'mkl' '>=2018' -add_package 'mkl-include' -add_package 'typing' -append_to_section 'build' '- pyyaml' caffe2_cmake_args+=("-DUSE_LEVELDB=OFF") caffe2_cmake_args+=("-DUSE_LMDB=OFF") @@ -307,6 +303,10 @@ caffe2_cmake_args+=("-DUSE_LMDB=OFF") # Add packages required for pytorch if [[ -n $integrated ]]; then add_package 'cffi' + add_package 'mkl' '>=2018' + add_package 'mkl-include' + add_package 'typing' + append_to_section 'build' '- pyyaml' append_to_section 'build' '- setuptools' #caffe2_cmake_args+=("-DBLAS=MKL") if [[ -n $cuda_ver ]]; then diff --git a/setup.py b/setup.py index 2e2ef60fb41313..042d8668bb7b96 100644 --- a/setup.py +++ b/setup.py @@ -659,9 +659,7 @@ def run(self): # Clang has an unfixed bug leading to spurious missing # braces warnings, see # https://bugs.llvm.org/show_bug.cgi?id=21629 - '-Wno-missing-braces', - # gcc7 seems to report spurious warnings with this enabled - "-Wno-stringop-overflow", + '-Wno-missing-braces' ] if check_env_flag('WERROR'): extra_compile_args.append('-Werror') @@ -1025,7 +1023,6 @@ def make_relative_rpath(path): 'lib/torch_shm_manager', 'lib/*.h', 'lib/include/ATen/*.h', - 'lib/include/ATen/core/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', diff --git a/setup_caffe2.py b/setup_caffe2.py index d8ebf4fc7ed84f..0fd620549b31d8 100644 --- a/setup_caffe2.py +++ b/setup_caffe2.py @@ -131,7 +131,6 @@ def run(self): # configure cmake_args = [ find_executable('cmake'), - '-DUSE_ATEN=ON', '-DBUILD_SHARED_LIBS=OFF', '-DPYTHON_EXECUTABLE:FILEPATH={}'.format(sys.executable), '-DPYTHON_INCLUDE_DIR={}'.format(sysconfig.get_python_inc()), diff --git a/test/common.py b/test/common.py index 4dbe3c56c47c98..1eb4076dbf360b 100644 --- a/test/common.py +++ b/test/common.py @@ -118,6 +118,16 @@ def dec(fn): return dec +def skipIfNoZeroSize(fn): + @wraps(fn) + def wrapper(*args, **kwargs): + if torch._C._use_zero_size_dim(): + fn(*args, **kwargs) + else: + raise unittest.SkipTest('Compiled without arbitrary zero size dimension support') + return wrapper + + def get_cuda_memory_usage(): # we don't need CUDA synchronize because the statistics are not tracked at # actual freeing, but at when marking the block as free. 
diff --git a/test/common_nn.py b/test/common_nn.py index 0444ba4eb6ae46..6172f4b15adc3f 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -125,7 +125,6 @@ def get_weight(m): module_name='ELU', constructor_args=(2.,), input_size=(3, 2, 5), - reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)) ), # TODO: reference function dict( @@ -449,43 +448,6 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele return output -# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space -def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): - input_lengths = torch.tensor(input_lengths, dtype=torch.long) - target_lengths = torch.tensor(target_lengths, dtype=torch.long) - dt = log_probs.dtype - log_probs = log_probs.double() # we need the accuracy as we are not in logspace - targets = targets.long() - cum_target_lengths = target_lengths.cumsum(0) - losses = [] - for i in range(log_probs.size(1)): - input_length = input_lengths[i].item() - target_length = target_lengths[i].item() - cum_target_length = cum_target_lengths[i].item() - targets_prime = targets.new_full((2 * target_length + 1,), blank) - if targets.dim() == 2: - targets_prime[1::2] = targets[i, :target_length] - else: - targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] - probs = log_probs[:input_length, i].exp() - alpha = log_probs.new_zeros((target_length * 2 + 1,)) - alpha[0] = probs[0, blank] - alpha[1] = probs[0, targets_prime[1]] - mask_third = (targets_prime[:-2] != targets_prime[2:]) - for t in range(1, input_length): - alpha_next = alpha.clone() - alpha_next[1:] += alpha[:-1] - alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) - alpha = probs[t, targets_prime] * alpha_next - losses.append(-alpha[-2:].sum().log()[None]) - output = torch.cat(losses, 0) - if reduction == 'elementwise_mean': - return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() - elif reduction == 'sum': - return output.sum() - output = output.to(dt) - return output - loss_reference_fns = { 'KLDivLoss': kldivloss_reference, 'NLLLoss': nllloss_reference, @@ -498,7 +460,6 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 'CosineEmbeddingLoss': cosineembeddingloss_reference, 'TripletMarginLoss': tripletmarginloss_reference, 'MarginRankingLoss': marginrankingloss_reference, - 'CTCLoss': ctcloss_reference, } @@ -880,7 +841,7 @@ def check_criterion_jacobian(self, criterion, input, target): class TestBase(object): - _required_arg_names = {'constructor_args', 'input', 'extra_args'} + _required_arg_names = {'constructor_args', 'input'} def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): self.desc = desc @@ -889,8 +850,8 @@ def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwa self.reference_fn = reference_fn for name in self._required_arg_names: if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name in {'constructor_args', 'extra_args'}: - kwargs[name] = tuple() + if name == 'constructor_args': + kwargs['constructor_args'] = tuple() else: raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" 
.format(self.get_name(), name)) @@ -918,10 +879,6 @@ def _unpack(self, value): def constructor_args(self): return self._get_arg('constructor_args', True) - @property - def extra_args(self): - return self._get_arg('extra_args', True) - def _get_arg(self, name, unpack): assert name in self._required_arg_names @@ -1146,9 +1103,9 @@ def __call__(self, test_case): target = self._get_target() if self.reference_fn is not None: - out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args) - ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,) - expected_out = self.reference_fn(*ref_args) + out = test_case._forward_criterion(module, input, target) + expected_out = self.reference_fn(deepcopy(input), + deepcopy(target), module) if isinstance(expected_out, torch.Tensor): expected_out = expected_out.item() test_case.assertEqual(out, expected_out) diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index be2fd6e0d969ba..8e66a66962d44f 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -237,7 +237,7 @@ TEST_CASE("modules") { REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { - auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); + auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0); REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect deleted file mode 100644 index bcbcffaee486a3..00000000000000 --- a/test/expect/TestBatched.test_for.expect +++ /dev/null @@ -1,22 +0,0 @@ -graph(%x.1_data : Dynamic - %x.1_mask : Dynamic - %x.1_dims : Dynamic - %y_data : Dynamic - %y_mask : Dynamic - %y_dims : Dynamic) { - %6 : int = prim::Constant[value=10]() - %7 : int = prim::Constant[value=1]() - %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) - block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { - %13 : int = prim::Constant[value=1]() - %14 : Long() = prim::NumToTensor(%13) - %alpha : float = prim::TensorToNum(%14) - %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) - %mask : Dynamic = aten::mul(%5_mask, %y_mask) - %dims : Dynamic = aten::__or__(%5_dims, %y_dims) - %19 : int = prim::Constant[value=1]() - %data : Dynamic = aten::where(%mask, %data.1, %5_data) - -> (%19, %data, %mask, %dims) - } - return (%x, %21, %22); -} diff --git a/test/expect/TestBatched.test_if_else.expect b/test/expect/TestBatched.test_if_else.expect deleted file mode 100644 index 0698584377a433..00000000000000 --- a/test/expect/TestBatched.test_if_else.expect +++ /dev/null @@ -1,52 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : Dynamic = aten::gt(%a.1_data, %b_data) - %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %9 : int = prim::TensorToNum(%6) - %10 : int = prim::Constant[value=1]() - %11 : Long() = prim::NumToTensor(%10) - %alpha.1 : float = prim::TensorToNum(%11) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) - %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %16 : int = prim::Constant[value=1]() - %17 : Long() = prim::NumToTensor(%16) - %alpha : float = prim::TensorToNum(%17) - %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : 
Dynamic = aten::__or__(%a.1_dims, %b_dims) - %22 : Dynamic = aten::type_as(%7, %6) - %cond_mask.1 : Dynamic = aten::mul(%6, %22) - %24 : int = aten::dim(%cond_mask.1) - %25 : int = prim::Constant[value=1]() - %26 : int = aten::eq(%24, %25) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%26) - block0() { - %30 : int = aten::dim(%data.1) - %31 : int = prim::Constant[value=1]() - %32 : int = aten::sub(%30, %31) - %33 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%32, %33, %cond_mask.1) - block0(%_ : int, %36 : Dynamic) { - %37 : int = aten::dim(%36) - %data.2 : Dynamic = aten::unsqueeze(%36, %37) - %39 : int = prim::Constant[value=1]() - -> (%39, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) - %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) - %res_dims : Dynamic = aten::__or__(%dims.1, %dims) - return (%res_data, %res_mask, %res_dims); -} diff --git a/test/expect/TestBatched.test_if_else_with_scalar.expect b/test/expect/TestBatched.test_if_else_with_scalar.expect deleted file mode 100644 index c7755a5b5501fc..00000000000000 --- a/test/expect/TestBatched.test_if_else_with_scalar.expect +++ /dev/null @@ -1,53 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : float = prim::Constant[value=0.1]() - %7 : Float() = prim::NumToTensor(%6) - %other : float = prim::TensorToNum(%7) - %9 : Dynamic = aten::gt(%a.1_data, %other) - %10 : int = prim::TensorToNum(%9) - %11 : int = prim::Constant[value=1]() - %12 : Long() = prim::NumToTensor(%11) - %alpha.1 : float = prim::TensorToNum(%12) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) - %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %17 : int = prim::Constant[value=1]() - %18 : Long() = prim::NumToTensor(%17) - %alpha : float = prim::TensorToNum(%18) - %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %23 : Dynamic = aten::type_as(%a.1_mask, %9) - %cond_mask.1 : Dynamic = aten::mul(%9, %23) - %25 : int = aten::dim(%cond_mask.1) - %26 : int = prim::Constant[value=1]() - %27 : int = aten::eq(%25, %26) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%27) - block0() { - %31 : int = aten::dim(%data.1) - %32 : int = prim::Constant[value=1]() - %33 : int = aten::sub(%31, %32) - %34 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%33, %34, %cond_mask.1) - block0(%_ : int, %37 : Dynamic) { - %38 : int = aten::dim(%37) - %data.2 : Dynamic = aten::unsqueeze(%37, %38) - %40 : int = prim::Constant[value=1]() - -> (%40, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) - %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) - %res_dims : Dynamic = aten::__or__(%dims.1, %dims) - return (%res_data, %res_mask, %res_dims); -} diff --git 
a/test/expect/TestBatched.test_if_noelse.expect b/test/expect/TestBatched.test_if_noelse.expect deleted file mode 100644 index 1d98fe9d02f29c..00000000000000 --- a/test/expect/TestBatched.test_if_noelse.expect +++ /dev/null @@ -1,46 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : Dynamic = aten::gt(%a.1_data, %b_data) - %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %9 : int = prim::TensorToNum(%6) - %10 : int = prim::Constant[value=1]() - %11 : Long() = prim::NumToTensor(%10) - %alpha : float = prim::TensorToNum(%11) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %16 : Dynamic = aten::type_as(%7, %6) - %cond_mask.1 : Dynamic = aten::mul(%6, %16) - %18 : int = aten::dim(%cond_mask.1) - %19 : int = prim::Constant[value=1]() - %20 : int = aten::eq(%18, %19) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%20) - block0() { - %24 : int = aten::dim(%data.1) - %25 : int = prim::Constant[value=1]() - %26 : int = aten::sub(%24, %25) - %27 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%26, %27, %cond_mask.1) - block0(%_ : int, %30 : Dynamic) { - %31 : int = aten::dim(%30) - %data.2 : Dynamic = aten::unsqueeze(%30, %31) - %33 : int = prim::Constant[value=1]() - -> (%33, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) - %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) - %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) - return (%res_data, %res_mask, %res_dims); -} diff --git a/test/expect/TestBatched.test_if_noelse_with_scalar.expect b/test/expect/TestBatched.test_if_noelse_with_scalar.expect deleted file mode 100644 index 935bedb22b3f80..00000000000000 --- a/test/expect/TestBatched.test_if_noelse_with_scalar.expect +++ /dev/null @@ -1,47 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : float = prim::Constant[value=0.1]() - %7 : Float() = prim::NumToTensor(%6) - %other : float = prim::TensorToNum(%7) - %9 : Dynamic = aten::gt(%a.1_data, %other) - %10 : int = prim::TensorToNum(%9) - %11 : int = prim::Constant[value=1]() - %12 : Long() = prim::NumToTensor(%11) - %alpha : float = prim::TensorToNum(%12) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %17 : Dynamic = aten::type_as(%a.1_mask, %9) - %cond_mask.1 : Dynamic = aten::mul(%9, %17) - %19 : int = aten::dim(%cond_mask.1) - %20 : int = prim::Constant[value=1]() - %21 : int = aten::eq(%19, %20) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%21) - block0() { - %25 : int = aten::dim(%data.1) - %26 : int = prim::Constant[value=1]() - %27 : int = aten::sub(%25, %26) - %28 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%27, %28, %cond_mask.1) - block0(%_ : int, %31 : Dynamic) { - %32 : int = aten::dim(%31) - %data.2 : Dynamic = aten::unsqueeze(%31, %32) - %34 : int = prim::Constant[value=1]() - -> (%34, 
%data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) - %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) - %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) - return (%res_data, %res_mask, %res_dims); -} diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect deleted file mode 100644 index a32cd392044f00..00000000000000 --- a/test/expect/TestBatched.test_while.expect +++ /dev/null @@ -1,65 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : int = prim::Constant[value=2147483647]() - %7 : Dynamic = aten::gt(%a.1_data, %b_data) - %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %10 : int = prim::TensorToNum(%7) - %11 : Dynamic = aten::mul(%7, %8) - %12 : Dynamic = aten::sum(%11) - %13 : int = prim::Constant[value=0]() - %14 : Dynamic = aten::gt(%12, %13) - %15 : int = prim::TensorToNum(%14) - %64 : Dynamic, %65 : Dynamic, %66 : Dynamic, %a : Dynamic, %62 : Dynamic, %63 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) - block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { - %24 : int = prim::Constant[value=1]() - %25 : Long() = prim::NumToTensor(%24) - %alpha : float = prim::TensorToNum(%25) - %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%6_mask, %b_mask) - %dims : Dynamic = aten::__or__(%6_dims, %b_dims) - %30 : Dynamic = aten::gt(%data.1, %b_data) - %31 : Dynamic = aten::mul(%mask, %b_mask) - %32 : Dynamic = aten::__or__(%dims, %b_dims) - %33 : int = prim::TensorToNum(%30) - %34 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) - %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %34) - %36 : int = aten::dim(%cond_mask.1) - %37 : int = prim::Constant[value=1]() - %38 : int = aten::eq(%36, %37) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) - block0() { - %42 : int = aten::dim(%data.1) - %43 : int = prim::Constant[value=1]() - %44 : int = aten::sub(%42, %43) - %45 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%44, %45, %cond_mask.1) - block0(%_ : int, %48 : Dynamic) { - %49 : int = aten::dim(%48) - %data.2 : Dynamic = aten::unsqueeze(%48, %49) - %51 : int = prim::Constant[value=1]() - -> (%51, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) - %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) - %res_dims : Dynamic = aten::__or__(%dims, %6_dims) - %57 : Dynamic = aten::mul(%30, %31) - %58 : Dynamic = aten::sum(%57) - %59 : int = prim::Constant[value=0]() - %60 : Dynamic = aten::gt(%58, %59) - %61 : int = prim::TensorToNum(%60) - -> (%61, %30, %31, %32, %res_data, %res_mask, %res_dims) - } - return (%a, %62, %63); -} diff --git a/test/expect/TestJit.test_concat_fusion.expect b/test/expect/TestJit.test_concat_fusion.expect index 
454a84cba1db76..027c2de33e5926 100644 --- a/test/expect/TestJit.test_concat_fusion.expect +++ b/test/expect/TestJit.test_concat_fusion.expect @@ -3,11 +3,12 @@ graph(%0 : Float(3, 20) %2 : Float(6, 20) = prim::FusionGroup_0[device=0](%0, %1) return (%2); } -with prim::FusionGroup_0 = graph(%3 : Float(3, 20) - %4 : Float(3, 20)) { - %6 : int = prim::Constant[value=1]() - %7 : Float(3, 20) = aten::add(%3, %4, %6) - %5 : Float(3, 20) = aten::mul(%3, %4) - %2 : Float(6, 20) = prim::FusedConcat[dim=0](%7, %5) - return (%2); +with prim::FusionGroup_0 = graph(%4 : Float(3, 20) + %5 : Float(3, 20)) { + %7 : int = prim::Constant[value=1]() + %8 : Float(3, 20) = aten::add(%4, %5, %7) + %6 : Float(3, 20) = aten::mul(%4, %5) + %2 : int = prim::Constant[value=0]() + %3 : Float(6, 20) = aten::cat(%8, %6, %2) + return (%3); } diff --git a/test/expect/TestJit.test_constant_prop_nested.expect b/test/expect/TestJit.test_constant_prop_nested.expect deleted file mode 100644 index 09ef82076edc4a..00000000000000 --- a/test/expect/TestJit.test_constant_prop_nested.expect +++ /dev/null @@ -1,15 +0,0 @@ -graph(%a : Dynamic) { - %1 : int = prim::Constant[value=2]() - %2 : Dynamic = aten::lt(%a, %1) - %3 : int = prim::TensorToNum(%2) - %c : int = prim::If(%3) - block0() { - %5 : int = prim::Constant[value=5]() - -> (%5) - } - block1() { - %6 : int = prim::Constant[value=1]() - -> (%6) - } - return (%c); -} diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect deleted file mode 100644 index 7cadfdbbc6b3ea..00000000000000 --- a/test/expect/TestJit.test_constant_prop_print.expect +++ /dev/null @@ -1,12 +0,0 @@ -graph(%input_tensor : Dynamic) { - %1 : int = prim::Constant[value=6]() - %2 : Dynamic = ^FIXME_zerol()() - %a : Dynamic = aten::add(%1, %2) - = prim::Print(%a) - %4 : int = prim::Constant[value=2]() - %5 : int = prim::Constant[value=1]() - %b : Dynamic = aten::add(%a, %4, %5) - %7 : int = prim::Constant[value=1]() - %8 : Dynamic = aten::add(%b, %input_tensor, %7) - return (%8); -} diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect deleted file mode 100644 index a6c305258bff95..00000000000000 --- a/test/expect/TestJit.test_constant_prop_rand.expect +++ /dev/null @@ -1,11 +0,0 @@ -graph() { - %0 : int = prim::Constant[value=6]() - %1 : int = prim::Constant[value=0]() - %2 : int[] = prim::Constant[value=[0, -1]]() - %3 : int[] = prim::Constant[value=[3]]() - %a : Dynamic = aten::randn(%3, %0, %1, %2) - %5 : int = prim::Constant[value=2]() - %6 : int = prim::Constant[value=1]() - %b : Dynamic = aten::add(%a, %5, %6) - return (%b); -} diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect deleted file mode 100644 index 029f9ac05a0783..00000000000000 --- a/test/expect/TestJit.test_constant_prop_simple.expect +++ /dev/null @@ -1,5 +0,0 @@ -graph(%input_tensor : Dynamic) { - %1 : int = prim::Constant[value=8]() - %2 : Dynamic = aten::add(%1, %input_tensor) - return (%2); -} diff --git a/test/expect/TestJit.test_lstm_fusion_concat.expect b/test/expect/TestJit.test_lstm_fusion_concat.expect index f0771c133c11d9..7884a95c48c9a1 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat.expect @@ -16,33 +16,34 @@ graph(%0 : Float(3, 10) %21 : Float(6, 20) = prim::FusionGroup_0[device=0](%2, %16, %20, %15, %19, %14, %18, %13, %17) return (%21); } -with prim::FusionGroup_0 = 
graph(%15 : Float(3, 20) - %25 : Float(3!, 20) +with prim::FusionGroup_0 = graph(%16 : Float(3, 20) %26 : Float(3!, 20) - %29 : Float(3!, 20) + %27 : Float(3!, 20) %30 : Float(3!, 20) - %33 : Float(3!, 20) + %31 : Float(3!, 20) %34 : Float(3!, 20) - %37 : Float(3!, 20) - %38 : Float(3!, 20)) { - %39 : int = prim::Constant[value=1]() - %40 : Float(3, 20) = aten::add(%37, %38, %39) - %35 : int = prim::Constant[value=1]() - %36 : Float(3, 20) = aten::add(%33, %34, %35) - %31 : int = prim::Constant[value=1]() - %32 : Float(3, 20) = aten::add(%29, %30, %31) - %27 : int = prim::Constant[value=1]() - %28 : Float(3, 20) = aten::add(%25, %26, %27) - %24 : Float(3, 20) = aten::sigmoid(%40) - %22 : Float(3, 20) = aten::sigmoid(%36) - %20 : Float(3, 20) = aten::tanh(%32) - %18 : Float(3, 20) = aten::sigmoid(%28) - %16 : Float(3, 20) = aten::mul(%22, %15) - %13 : Float(3, 20) = aten::mul(%24, %20) - %9 : int = prim::Constant[value=1]() - %10 : Float(3, 20) = aten::add(%16, %13, %9) - %6 : Float(3, 20) = aten::tanh(%10) - %5 : Float(3, 20) = aten::mul(%18, %6) - %2 : Float(6, 20) = prim::FusedConcat[dim=0](%5, %10) - return (%2); + %35 : Float(3!, 20) + %38 : Float(3!, 20) + %39 : Float(3!, 20)) { + %40 : int = prim::Constant[value=1]() + %41 : Float(3, 20) = aten::add(%38, %39, %40) + %36 : int = prim::Constant[value=1]() + %37 : Float(3, 20) = aten::add(%34, %35, %36) + %32 : int = prim::Constant[value=1]() + %33 : Float(3, 20) = aten::add(%30, %31, %32) + %28 : int = prim::Constant[value=1]() + %29 : Float(3, 20) = aten::add(%26, %27, %28) + %25 : Float(3, 20) = aten::sigmoid(%41) + %23 : Float(3, 20) = aten::sigmoid(%37) + %21 : Float(3, 20) = aten::tanh(%33) + %19 : Float(3, 20) = aten::sigmoid(%29) + %17 : Float(3, 20) = aten::mul(%23, %16) + %14 : Float(3, 20) = aten::mul(%25, %21) + %10 : int = prim::Constant[value=1]() + %11 : Float(3, 20) = aten::add(%17, %14, %10) + %7 : Float(3, 20) = aten::tanh(%11) + %6 : Float(3, 20) = aten::mul(%19, %7) + %2 : int = prim::Constant[value=0]() + %3 : Float(6, 20) = aten::cat(%6, %11, %2) + return (%3); } diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect index c8c82e5199c030..ea2fa3737c0556 100644 --- a/test/expect/TestScript.test_cat_lifts.expect +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -1,18 +1,15 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic[] = prim::ListConstruct(%x, %x) - %3 : Dynamic = aten::cat(%2, %1) - return (%3); + %2 : Dynamic = aten::cat(%x, %x, %1) + return (%2); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic[] = prim::ListConstruct() - %3 : Dynamic = aten::cat(%2, %1) - return (%3); + %2 : Dynamic = aten::cat(%1) + return (%2); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic[] = prim::ListConstruct(%x) - %3 : Dynamic = aten::cat(%2, %1) - return (%3); + %2 : Dynamic = aten::cat(%x, %1) + return (%2); } diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 37f08643f139a4..591e499da96671 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -6,7 +6,6 @@ graph(%0 : Double(100) %5 : Double(4) = aten::view(%2, %4) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) - %8 : Dynamic[] = prim::ListConstruct(%7) - %20 : Double(100) = aten::index_put(%0, %8, %5) - return (%20); + %19 : Double(100) = 
aten::index_put(%0, %7, %5) + return (%19); } diff --git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 772308223b454b..42f8e49142942e 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -3,7 +3,6 @@ graph(%0 : Double(100) %2 : Double(4)) { %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) - %5 : Dynamic[] = prim::ListConstruct(%4) - %17 : Double(100) = aten::index_put(%0, %5, %2) - return (%17); + %16 : Double(100) = aten::index_put(%0, %4, %2) + return (%16); } diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect deleted file mode 100644 index a8eff9ab2c1387..00000000000000 --- a/test/onnx/expect/TestOperators.test_elu.expect +++ /dev/null @@ -1,63 +0,0 @@ -ir_version: 3 -producer_name: "pytorch" -producer_version: "0.3" -graph { - node { - input: "0" - output: "1" - op_type: "Elu" - attribute { - name: "alpha" - f: 1 - type: FLOAT - } - } - name: "torch-jit-export" - input { - name: "0" - type { - tensor_type { - elem_type: FLOAT - shape { - dim { - dim_value: 1 - } - dim { - dim_value: 2 - } - dim { - dim_value: 3 - } - dim { - dim_value: 4 - } - } - } - } - } - output { - name: "1" - type { - tensor_type { - elem_type: FLOAT - shape { - dim { - dim_value: 1 - } - dim { - dim_value: 2 - } - dim { - dim_value: 3 - } - dim { - dim_value: 4 - } - } - } - } - } -} -opset_import { - version: 7 -} diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index fc23156d1cbf47..3d8210b14bcbee 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 204a59e88ef5a6..e50f2e12537d56 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index d3eb9cf08c30a6..3cda8f244819b7 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index 39ba6940e2289c..2aefbc6dbc8622 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index cd9c4eaaaf50a7..83656cb3a5ce04 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git 
a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index 3c1321664dd3fd..b1ff53c2e4e7d8 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -10,33 +10,33 @@ graph { t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" } type: TENSOR } } node { + input: "0" + input: "1" output: "2" + op_type: "Reshape" + } + node { + output: "3" op_type: "Constant" attribute { name: "value" t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "0" input: "2" - output: "3" - op_type: "Reshape" - } - node { input: "3" - input: "1" output: "4" op_type: "Tile" } diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index ba8292e616686a..1e2c401dcc3ac0 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -364,10 +364,6 @@ def test_pow(self): y = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x, y: x.pow(y), (x, y)) - def test_elu(self): - x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) - self.assertONNX(nn.ELU(), x) - def test_selu(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(nn.SELU(), x) diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 7130a7695cc69b..85ef2eac5bf2ce 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -676,52 +676,6 @@ def forward(self, x): x = Variable(torch.randn(*shape)) self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - def test_cumsum(self): - shape = (3, 4, 5) - for params in [{'dim': i} for i in range(len(shape))]: - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x): - return torch.cumsum(x, **params) - x = Variable(torch.randn(*shape)) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - - def test_repeat(self): - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x): - return x.repeat(1, 2, 3, 4) - - x = Variable(torch.randn(4, 3, 2, 1), requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - - def test_repeat_dim_overflow(self): - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x): - return x.repeat(1, 2, 3, 4) - - x = Variable(torch.randn(1, 2), requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - - def test_repeat_dynamic(self): - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x, y): - return x.repeat(y.size()[0] / 2, y.size()[1] * 2) - - x = Variable(torch.randn(1, 2), requires_grad=True) - y = 
Variable(torch.randn(2, 4), requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) - def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{'dim': i} for i in range(len(shape))]: diff --git a/test/test_autograd.py b/test/test_autograd.py index 9d39043db9b56d..3ef7c21d49fc90 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -15,7 +15,7 @@ from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \ - suppress_warnings, TEST_WITH_ROCM + suppress_warnings, skipIfNoZeroSize, TEST_WITH_ROCM from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction from torch.testing import make_non_contiguous, randn_like @@ -1851,16 +1851,6 @@ def backward(ctx, grad_output): out.sum().backward() self.assertEqual(x.grad.data, y_data) - def test_broadcast_tensors(self): - f_args_variable = (torch.randn(3, requires_grad=True), - torch.randn(1, 2, 1, requires_grad=True), - torch.randn(1, 1, requires_grad=True), - torch.randn(5, 1, 1, requires_grad=True)) - f_args_tensor = deepcopy(unpack_variables(f_args_variable)) - run_functional_checks(self, "test_broadcast_tensors", "broadcast", - lambda a, b, c, d: torch.broadcast_tensors(a, b, c, d), - True, f_args_variable, f_args_tensor) - def test_cat(self): f_args_variable = (torch.randn(1, S, S, requires_grad=True), torch.randn(2, S, S, requires_grad=True), @@ -1902,6 +1892,7 @@ def test_cat_empty_legacy(self): False, f_args_variable, f_args_tensor) self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION)) + @skipIfNoZeroSize def test_cat_empty(self): f_args_variable = (torch.randn(0, S, requires_grad=True), torch.randn(S, S, requires_grad=True)) @@ -1910,6 +1901,7 @@ def test_cat_empty(self): lambda a, b: torch.cat((a, b)), True, f_args_variable, f_args_tensor) + @skipIfNoLapack def test_potrf(self): root = Variable(torch.tril(torch.rand(S, S)), requires_grad=True) @@ -3131,7 +3123,7 @@ class dont_convert(tuple): ('select', (S, S, S), (1, -1), 'wrap_dim', [0]), ('select', (S,), (0, 2), '1d'), ('narrow', (S, S, S), (1, 2, 2), 'dim', [0]), - ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0]), + ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0], [skipIfNoZeroSize]), ('squeeze', (S, 1, S, 1), NO_ARGS), ('squeeze', (1, 1, 1, 1), NO_ARGS, 'input_sizes_are_ones'), ('squeeze', (S, 1, S, 1), (1,), '1_dim', [0]), diff --git a/test/test_distributions.py b/test/test_distributions.py index 8a607ece6931c5..7effb9012e9fc6 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -42,8 +42,8 @@ Independent, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, Multinomial, MultivariateNormal, - NegativeBinomial, Normal, OneHotCategorical, Pareto, - Poisson, RelaxedBernoulli, RelaxedOneHotCategorical, + Normal, OneHotCategorical, Pareto, Poisson, + RelaxedBernoulli, RelaxedOneHotCategorical, StudentT, TransformedDistribution, Uniform, Weibull, constraints, kl_divergence) from torch.distributions.constraint_registry import biject_to, transform_to @@ -123,16 +123,6 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': torch.tensor(0.)}, ]), - Example(NegativeBinomial, [ - {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 
0.9]], requires_grad=True), 'total_count': 10}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10])}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10, 8])}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), - 'total_count': torch.tensor([[10., 8.], [5., 3.]])}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), - 'total_count': torch.tensor(0.)}, - ]), Example(Multinomial, [ {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': 10}, @@ -452,12 +442,6 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), 'total_count': 10}, ]), - Example(NegativeBinomial, [ - {'probs': torch.tensor([[-0.0000001, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), - 'total_count': 10}, - {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), - 'total_count': 10}, - ]), Example(Cauchy, [ {'loc': 0.0, 'scale': -1.0}, {'loc': torch.tensor([0.0]), 'scale': 0.0}, @@ -927,37 +911,6 @@ def test_binomial_enumerate_support(self): bin1 = Binomial(torch.tensor(5), torch.tensor(0.5)) self.assertEqual(bin1.enumerate_support(), torch.arange(6)) - def test_negative_binomial(self): - p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) - for total_count in [1, 2, 10]: - self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, p), [p]) - self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, None, p.log()), [p]) - self.assertRaises(NotImplementedError, NegativeBinomial(10, p).rsample) - self.assertRaises(NotImplementedError, NegativeBinomial(10, p).entropy) - - @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_negative_binomial_log_prob(self): - probs = torch.tensor(torch.arange(0.05, 1, 0.1)) - for total_count in [1, 2, 10]: - - def ref_log_prob(idx, x, log_prob): - p = probs.view(-1)[idx].item() - expected = scipy.stats.nbinom(total_count, 1 - p).logpmf(x) - self.assertAlmostEqual(log_prob, expected, places=3) - - self._check_log_prob(NegativeBinomial(total_count, probs), ref_log_prob) - logits = probs_to_logits(probs, is_binary=True) - self._check_log_prob(NegativeBinomial(total_count, logits=logits), ref_log_prob) - - @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_negative_binomial_log_prob_vectorized_count(self): - probs = torch.tensor([0.2, 0.7, 0.9]) - for total_count, sample in [(torch.tensor([10]), torch.tensor([7., 3., 9.])), - (torch.tensor([1, 2, 10]), torch.tensor([0., 1., 9.]))]: - log_prob = NegativeBinomial(total_count, probs).log_prob(sample) - expected = scipy.stats.nbinom(total_count.cpu().numpy(), 1 - probs.cpu().numpy()).logpmf(sample) - self.assertAlmostEqual(log_prob, expected, places=4) - def test_multinomial_1d(self): total_count = 10 p = torch.tensor([0.1, 0.2, 0.3], requires_grad=True) @@ -3522,7 +3475,7 @@ def setUp(self): ), ( Binomial(10, simplex_tensor), - scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor.numpy()) + scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor) ), ( Cauchy(random_var, positive_var), @@ -3909,9 +3862,6 @@ def get_constraints(self, is_cuda=False): constraints.greater_than(0), constraints.greater_than(2), constraints.greater_than(-2), - constraints.greater_than_eq(0), - constraints.greater_than_eq(2), - constraints.greater_than_eq(-2), 
constraints.less_than(tensor([-10., -2, 0, 2, 10])), constraints.less_than(0), constraints.less_than(2), @@ -3921,10 +3871,6 @@ def get_constraints(self, is_cuda=False): tensor([-3., 3, 1, 5, 5])), constraints.interval(-2, -1), constraints.interval(1, 2), - constraints.half_open_interval(tensor([-4., -2, 0, 2, 4]), - tensor([-3., 3, 1, 5, 5])), - constraints.half_open_interval(-2, -1), - constraints.half_open_interval(1, 2), constraints.simplex, constraints.lower_cholesky, ] diff --git a/test/test_indexing.py b/test/test_indexing.py index afe9e6d60c653c..00865d9f576b74 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -1,4 +1,4 @@ -from common import TestCase, run_tests +from common import TestCase, run_tests, skipIfNoZeroSize import torch import warnings from torch import tensor @@ -93,6 +93,7 @@ def test_empty_index(self): y[mask] = -1 self.assertEqual(x, y) + @skipIfNoZeroSize def test_empty_ndim_index(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -103,12 +104,14 @@ def test_empty_ndim_index(self): self.assertEqual(torch.empty(2, 0, 6, 4, 5, device=device), x[:, torch.empty(0, 6, dtype=torch.int64, device=device)]) + @skipIfNoZeroSize def test_empty_ndim_index_bool(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(5, device=device) self.assertRaises(IndexError, lambda: x[torch.empty(0, 2, dtype=torch.uint8, device=device)]) + @skipIfNoZeroSize def test_empty_slice(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -472,18 +475,26 @@ def test_boolean_indexing_twodim(self): def test_boolean_indexing_weirdness(self): # Weird boolean indexing things a = torch.ones((2, 3, 4)) - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + if torch._C._use_zero_size_dim(): + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + else: + self.assertEqual((0,), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[True, [0, 1], True, True, [1], [[2]]]) - self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) + if torch._C._use_zero_size_dim(): + self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) def test_boolean_indexing_weirdness_tensors(self): # Weird boolean indexing things false = torch.tensor(False) true = torch.tensor(True) a = torch.ones((2, 3, 4)) - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + if torch._C._use_zero_size_dim(): + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + else: + self.assertEqual((0,), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[true, [0, 1], true, true, [1], [[2]]]) - self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) + if torch._C._use_zero_size_dim(): + self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) def test_boolean_indexing_alldims(self): true = torch.tensor(True) diff --git a/test/test_jit.py b/test/test_jit.py index b3bbe9892bc7db..ab4c907e72d19f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1122,95 +1122,13 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) - def test_constant_prop_simple(self): - @torch.jit.script - def constant_prop(input_tensor): - a = 2 * 3 - b = a + 2 - return b + input_tensor - - x = torch.tensor(2) - out_ref = constant_prop(x) - self.run_pass('constant_propagation', constant_prop.graph) - out_test = constant_prop(torch.tensor(2)) - self.assertEqual(out_ref, 
out_test) - self.assertExpected(canonical(constant_prop.graph)) - - def test_constant_prop_nested(self): - @torch.jit.script - def constant_prop(a): - b = 2 + 1 - if a < 2: - c = b + 2 - else: - c = b - 2 - return c - - out_ref = constant_prop(torch.tensor(2)) - self.run_pass('constant_propagation', constant_prop.graph) - out_test = constant_prop(torch.tensor(2)) - self.assertEqual(out_ref, out_test) - self.assertExpected(canonical(constant_prop.graph)) - - def test_constant_prop_print(self): - @torch.jit.script - def constant_prop(input_tensor): - a = 2 * 3 + FIXME_zerol() - print(a) - b = a + 2 - return b + input_tensor - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - - def test_constant_prop_rand(self): - @torch.jit.script - def constant_prop(): - a = torch.randn([3]) - b = a + 2 - return b - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - - # TODO: implement - @unittest.expectedFailure - def test_constant_prop_if_constant(self): - @torch.jit.script - def constant_prop(): - b = 3 - if True: - b = 1 - if False: - b = 2 - return b - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - - # TODO: implement - @unittest.expectedFailure - def test_constant_prop_loop_constant(self): - @torch.jit.script - def constant_prop(): - b = 0 - while True: - b = 1 - while False: - b = 2 - return b - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - class TestBatched(TestCase): # generate random examples and create an batchtensor with them def rand_batch(self, *dims): dims = [dim for dim in dims if dim != ()] - xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:]), - requires_grad=True) for i in range(dims[0])] - xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]]).byte()) + xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:])) for i in range(dims[0])] + xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]])) return xs, xb def test_create_batchtensor(self): @@ -1238,20 +1156,20 @@ def tanh(a): def test_batch_elementwise_binary(self): @torch.jit.batch(batch_size=4) - def add(a, b): - return a + b + def mul(a, b): + return a * b xs, batch = self.rand_batch(4, (True, 3), (False, 2)) xs2, batch2 = xs, batch - res_batch = add(batch, batch2) - res = [torch.add(xs[j], xs2[j]) for j in range(4)] + res_batch = mul(batch, batch2) + res = [torch.mul(xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) # test broadcast xs, batch = self.rand_batch(4, (False, 3), (False, 2)) b = torch.rand(3, 2) - res_batch = add(batch, b) - res = [torch.add(xs[j], b) for j in range(4)] + res_batch = mul(batch, b) + res = [torch.mul(xs[j], b) for j in range(4)] self.assertEqual(res, res_batch.examples()) def test_batch_mm(self): @@ -1298,33 +1216,6 @@ def matmul_test(xs, batch, xs2, batch2): xs2, batch2 = self.rand_batch(4, (False, 2), (True, 3)) matmul_test(xs, batch, xs2, batch2) - def test_batch_select(self): - @torch.jit.batch(batch_size=4) - def select(x): - return torch.select(x, 1, 0) - - xs, batch = self.rand_batch(4, (True, 3), (True, 2)) - res_batch = select(batch) - res = [torch.select(xs[j], 1, 0) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - xs, batch = self.rand_batch(4, (False, 3), (True, 2)) - res_batch = select(batch) - res = 
[torch.select(xs[j], 1, 0) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_index_select(self): - @torch.jit.batch(batch_size=4) - def index_select(x, ind): - return x.index_select(1, ind) - - xs, batch = self.rand_batch(4, (False, 5), (True, 2)) - ind = [torch.randint(0, 4, (1,), dtype=torch.long) for i in range(4)] - ind_batch = BatchTensor(ind, torch.tensor([]).byte()) - res_batch = index_select(batch, ind_batch) - res = [torch.index_select(xs[j], 1, ind[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - def test_batch_where(self): @torch.jit.batch(batch_size=4) def where(c, a, b): @@ -1341,300 +1232,43 @@ def where(c, a, b): res = [torch.where(xs_cond[j], xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) - def test_batch_argmax(self): - @torch.jit.batch(batch_size=4) - def argmax(a): - return torch.argmax(a, 1) - - xs, batch = self.rand_batch(4, (True, 5), (True, 6)) - res_batch = argmax(batch) - res = [torch.argmax(xs[j], 1) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - @torch.jit.batch(batch_size=4) - def argmax(a): - return torch.argmax(a, 1, False) - - res_batch = argmax(batch) - res = [torch.argmax(xs[j], 1, False) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_topk(self): - @torch.jit.batch(batch_size=4) - def topk(a): - return torch.topk(a, 3, 1) - - xs, batch = self.rand_batch(4, (False, 5), (True, 6)) - - # along static dim - res_batch = topk(batch) - res = [torch.topk(xs[j], 3, 1)[0] for j in range(4)] - res_idx = [torch.topk(xs[j], 3, 1)[1] for j in range(4)] - self.assertEqual(res, res_batch[0].examples()) - self.assertEqual(res_idx, res_batch[1].examples()) - - @torch.jit.batch(batch_size=4) - def topk(a): - return torch.topk(a, 1, 2) - - # along dynamic dim - res_batch = topk(batch) - res = [torch.topk(xs[j], 1, 2)[0] for j in range(4)] - res_idx = [torch.topk(xs[j], 1, 2)[1] for j in range(4)] - self.assertEqual(res, res_batch[0].examples()) - self.assertEqual(res_idx, res_batch[1].examples()) - - def test_batch_softmax(self): - @torch.jit.batch(batch_size=4) - def softmax(a): - return torch.softmax(a, 1) - - xs, batch = self.rand_batch(4, (False, 5), (True, 6)) - - # along static dim - res_batch = softmax(batch) - res = [torch.softmax(xs[j], 1) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - @torch.jit.batch(batch_size=4) - def softmax(a): - return torch.softmax(a, 2) - - # along dynamic dim - res_batch = softmax(batch) - res = [torch.softmax(xs[j], 2) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_view(self): - @torch.jit.batch(batch_size=4) - def view(a): - return a.view([4, -1, 3]) - - xs, batch = self.rand_batch(4, (True, 5), (False, 3)) - res_batch = view(batch) - res = [xs[j].view([1, -1, 3]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_cat(self): - @torch.jit.batch(batch_size=4) - def cat2(a, b): - return torch.cat([a, b], 2) - - xs, batch = self.rand_batch(4, (True, 5), (False, 3)) - xs2, batch2 = xs, batch - res_batch = cat2(batch, batch2) - res = [torch.cat([xs[j], xs2[j]], 2) for j in range(4)] - self.assertEqual(res, res_batch.examples()) + @unittest.skip("Need support for scalar arguments") + def test_lstm_cell(self): + def LSTMCell(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) 
+ b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + return h_t - def test_batch_sum(self): @torch.jit.batch(batch_size=4) - def batch_sum(a): - return a.sum() - - xs, batch = self.rand_batch(4, (True, 5), (False, 3)) - res_batch = batch_sum(batch) - res = [xs[j].sum().unsqueeze(0) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_if_else(self): - def single_if(a, b): - if a > b: - a = a + b - else: - a = a - b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_if_else_with_scalar(self): - def single_if(a, b): - if a > 0.1: - a = a + b - else: - a = a - b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_if_noelse(self): - def single_if(a, b): - if a > b: - a = a + b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_if_noelse_with_scalar(self): - def single_if(a, b): - if a > 0.1: - a = a + b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_while(self): - def single_while(a, b): - while a > b: - a = a - b - return a - - batch_while = torch.jit.batch(batch_size=4)(single_while) - - a, batch_a = self.rand_batch(4, ()) - b = [torch.abs(torch.rand(1)) for i in range(4)] - batch_b = BatchTensor(b, torch.tensor([]).byte()) - res_batch = batch_while(batch_a, batch_b) - res = [single_while(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_while = torch.jit.script(single_while) - graph = torch.to_batch_graph(script_while.graph) - self.assertExpected(str(graph)) - - def test_for(self): - def single_for(x, y): - for _ in range(10): - x = x + y - return x - - batch_for = torch.jit.batch(batch_size=4)(single_for) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_for(batch_a, batch_b) - res = [single_for(a[j], b[j]) for j in range(4)] - 
self.assertEqual(res, res_batch.examples()) - - script_for = torch.jit.script(single_for) - graph = torch.to_batch_graph(script_for.graph) - self.assertExpected(str(graph)) - - def test_lstm(self): - def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): - for i in range(x_all.size(1)): - x = x_all.select(1, i) - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - h = h_t - c = c_t - return h - - LSTM_batch = torch.jit.batch(batch_size=4)(LSTM) + def LSTMCell_batch(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + return h_t batch_size, input_size, hidden_size = 4, 3, 2 - xs, batch = self.rand_batch(batch_size, (True, 4), (False, input_size)) - hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) - cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) - - # input to hidden weights - w_xi = torch.rand(input_size, hidden_size) - w_xf = torch.rand(input_size, hidden_size) - w_xo = torch.rand(input_size, hidden_size) - w_xc = torch.rand(input_size, hidden_size) - # hidden to hidden weights - w_hi = torch.rand(hidden_size, hidden_size) - w_hf = torch.rand(hidden_size, hidden_size) - w_ho = torch.rand(hidden_size, hidden_size) - w_hc = torch.rand(hidden_size, hidden_size) - # bias terms - b_i = torch.rand(hidden_size) - b_f = torch.rand(hidden_size) - b_o = torch.rand(hidden_size) - b_c = torch.rand(hidden_size) - - ys = [LSTM(xs[j], hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] - ybs = LSTM_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) - self.assertEqual(ys, ybs.examples()) - - def test_greedy_search(self): - def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, - b_i, b_f, b_o, b_c, w_hs, b_s, iter_num): - iter_count = torch.zeros_like(iter_num) - while(iter_count < iter_num): - iter_count += 1 - # LSTM Cell - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - h = h_t - c = c_t - # calculate feature with max probability - s_t = torch.matmul(h_t, w_hs) + b_s - p_t = torch.softmax(s_t, 1) - i_t = torch.argmax(p_t, 1) - x = embed.index_select(1, i_t).squeeze(1) - return h - - 
greedy_batch = torch.jit.batch(batch_size=4)(greedy) - - batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 xs, batch = self.rand_batch(batch_size, (False, input_size)) hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) - embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) - iter_num = [torch.randint(2, 5, (1,)) for i in range(batch_size)] - iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) # input to hidden weights w_xi = torch.rand(input_size, hidden_size) @@ -1651,102 +1285,11 @@ def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_f = torch.rand(hidden_size) b_o = torch.rand(hidden_size) b_c = torch.rand(hidden_size) - # hidden to vocab weights, bias - w_hs = torch.rand(hidden_size, vocab_size) - b_s = torch.rand(vocab_size) - - ys = [greedy(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j]) for j in range(batch_size)] - ybs = greedy_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch) - self.assertEqual(ys, ybs.examples()) - def test_beam_search(self): - def beam(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, - b_i, b_f, b_o, b_c, w_hs, b_s, iter_num, idx): - k = 5 - vocab_size = embed.size(1) - iter_count = torch.zeros_like(iter_num) - max_len = idx.size(2) - while(iter_count < iter_num): - iter_count += 1 - # LSTM Cell - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - h = h_t - c = c_t - # calculate features with max probability - s_t = torch.matmul(h_t, w_hs) + b_s - s_t = s_t.view([1, s_t.size(1) * s_t.size(2)]) - p_t = torch.softmax(s_t, 1) - prob_t, idx_t = torch.topk(p_t, k, 1) - if(int(idx_t.dim()) > 1): - idx_t_tmp = idx_t.squeeze(0) - else: - idx_t_tmp = idx_t - new_y = torch.fmod(idx_t_tmp, vocab_size) - pre_y = idx_t_tmp / vocab_size - x = embed.index_select(1, new_y) - h = h_t.index_select(1, pre_y) - c = c_t.index_select(1, pre_y) - iter = int(iter_count[0]) - idx = torch.cat([idx.narrow(2, 0, iter).index_select(1, pre_y), - torch.fmod(idx_t, vocab_size).unsqueeze(-1), - idx.narrow(2, iter, max_len - iter)], 2) - idx = idx.narrow(2, 0, max_len) - return idx - - beam_batch = torch.jit.batch(batch_size=4)(beam) - - k = 5 - batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 - max_len = 5 - xs, batch = self.rand_batch(batch_size, (False, 1), (False, input_size)) - hx, h_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) - cx, c_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) - embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) - iter_num = [torch.randint(2, max_len + 1, (1,)) for i in range(batch_size)] - iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) - - # input to hidden weights - w_xi = torch.rand(input_size, hidden_size) - w_xf = torch.rand(input_size, hidden_size) - w_xo = torch.rand(input_size, hidden_size) - 
w_xc = torch.rand(input_size, hidden_size) - # hidden to hidden weights - w_hi = torch.rand(hidden_size, hidden_size) - w_hf = torch.rand(hidden_size, hidden_size) - w_ho = torch.rand(hidden_size, hidden_size) - w_hc = torch.rand(hidden_size, hidden_size) - # bias terms - b_i = torch.rand(1, hidden_size) - b_f = torch.rand(1, hidden_size) - b_o = torch.rand(1, hidden_size) - b_c = torch.rand(1, hidden_size) - # hidden to vocab weights, bias - w_hs = torch.rand(hidden_size, vocab_size) - b_s = torch.rand(1, vocab_size) - - idx_batch = torch.jit.BatchTensor(torch.zeros([batch_size, k, max_len], dtype=torch.long), - torch.zeros([batch_size, 1, max_len]).byte(), - torch.tensor([0, 1]).byte()) - idx = [torch.zeros([1, k, max_len], dtype=torch.long) for _ in range(batch_size)] - - ys = [beam(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, - b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j], idx[j]).narrow(2, 0, int(iter_num[j])) - for j in range(batch_size)] - ybs = beam_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch, idx_batch) + ys = [LSTMCell(xs[j].squeeze(0), hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] + ybs = LSTMCell_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) self.assertEqual(ys, ybs.examples()) @@ -4105,10 +3648,10 @@ def test_unknown_builtin(self): def unknown_builtin(x): return x.splork(3) - def test_return_tuple(self): - with self.assertRaisesRegex(RuntimeError, 'only supported return types'): + def test_expected_tensor_found_tuple(self): + with self.assertRaisesRegex(RuntimeError, 'expected a tensor value but found'): @torch.jit.script - def return_tuple(x): + def return_tuple_wrong(x): a = (x, x) return a, x @@ -4827,17 +4370,6 @@ def tuple_arg(x): # type: (Tuple[Tensor, Tensor]) -> Tensor return x + 1 - def test_script_non_tensor_args_outputs(self): - @torch.jit.script - def fn(x, y): - # type: (Tensor, float) -> float - return float((x + y).sum()) - - x = torch.ones(2, 2) - z = fn(x, 1) - self.assertIsInstance(z, float) - self.assertEqual(z, 8.) 
- @unittest.skip('https://github.com/pytorch/pytorch/issues/9595') def test_inline_and_run_annotated_script_fn(self): @torch.jit.script @@ -5380,9 +4912,11 @@ def forward(self, x, y): 'test_expand_new_dim', 'test_expand_new_dim_front_old_front_1', 'test_expand_scalar_to_dims', + 'test_expand_scalar_to_scalar', 'test_expand_size', 'test_permute', 'test_permute_neg_dim', + 'test_permute_scalar', 'test_repeat', 'test_repeat_scalar', 'test_repeat_single_number', @@ -5390,10 +4924,12 @@ def forward(self, x, y): 'test_reshape', 'test_reshape_1d', 'test_reshape_scalar_to_1d', + 'test_reshape_scalar_to_scalar', 'test_reshape_size', 'test_view', 'test_view_1d', 'test_view_scalar_to_1d', + 'test_view_scalar_to_scalar', 'test_view_size', 'test_split_dim', 'test_split_dim_neg0', diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index de65e6fc8ce7a0..1463d15cf22d0c 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -693,18 +693,14 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return module.backward(input, grad_output) - def _forward_criterion(self, criterion, input, target, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _forward_criterion(self, criterion, input, target): with torch.no_grad(): - return criterion.forward(input, target, *extra_args) + return criterion.forward(input, target) - def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _backward_criterion(self, criterion, input, target, gradOutput=None): # Ignore gradOutput. It's used for non-legacy tests. with torch.no_grad(): - return criterion.backward(input, target, *extra_args) + return criterion.backward(input, target) def _zero_grad_parameters(self, module): return module.zeroGradParameters() diff --git a/test/test_nn.py b/test/test_nn.py index 8682463cf9bc6c..ccd698747ae8d5 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -36,7 +36,7 @@ TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, loss_reference_fns, get_reduction, \ - get_weight, smoothl1loss_reference, kldivloss_reference, ctcloss_reference + get_weight, smoothl1loss_reference, kldivloss_reference if TEST_SCIPY: @@ -383,8 +383,6 @@ class NewCriterionTest(InputVariableMixin, CriterionTest): def __init__(self, *args, **kwargs): super(NewCriterionTest, self).__init__(*args, **kwargs) self.check_gradgrad = kwargs.get('check_gradgrad', True) - self.check_half = kwargs.get('check_half', True) - self.convert_target = kwargs.get('convert_target', True) def _do_extra_tests(self, test_case, module, input, target): if not self.check_gradgrad: @@ -409,7 +407,7 @@ def apply_fn(input1, input2, *params): gradcheck(apply_fn, inputs) gradgradcheck(apply_fn, inputs) - def test_cuda(self, test_case, dtype=None, extra_args=None): + def test_cuda(self, test_case, dtype=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) @@ -432,7 +430,7 @@ def convert_dtype(obj, dtype, requires_grad=False): if dtype is not None: cpu_input = convert_dtype(cpu_input, dtype, True) # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: + if not isinstance(cpu_target, torch.LongTensor): cpu_target = convert_dtype(cpu_target, dtype) cpu_module.type(dtype) gpu_module.type(dtype) @@ -449,13 +447,13 @@ 
def convert_dtype(obj, dtype, requires_grad=False): # Loss modules with weights require consistent input/module weight types cpu_module = self.constructor(*self.constructor_args) - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) - gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) + cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) + gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target) # dtype can be None, so set precision in this way instead of a precision map test_case.assertEqual(cpu_output, gpu_output, 1e-1 if dtype == torch.half else 4e-4) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) + cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target) + gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target) test_case.assertEqual(cpu_gradInput, gpu_gradInput, 1e-1 if dtype == torch.half else 4e-4) except NotImplementedError: pass @@ -467,10 +465,6 @@ def _get_target(self): def constructor_args(self): return self._get_arg('constructor_args', False) - @property - def extra_args(self): - return self._get_arg('extra_args', False) - class TestNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -485,24 +479,20 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return None return input.grad.data - def _forward_criterion(self, criterion, input, target, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _forward_criterion(self, criterion, input, target): if isinstance(input, tuple): - args = input + (target,) + extra_args + args = input + (target,) output = criterion(*args) else: - output = criterion(input, target, *extra_args) + output = criterion(input, target) return output.item() - def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _backward_criterion(self, criterion, input, target, gradOutput=None): input_tuple = input if isinstance(input, tuple) else (input,) for i in input_tuple: if i.grad is not None: i.grad.data.zero_() - args = input_tuple + (target,) + extra_args + args = input_tuple + (target,) if gradOutput is None: gradOutput = torch.ones(()) criterion(*args).backward(gradOutput.type_as(input_tuple[0])) @@ -1595,7 +1585,6 @@ def test(nonlinearity, *args, **kwargs): test('relu6') test('elu') test('selu') - test('celu') test('rrelu') test('rrelu', inplace=True) test('hardtanh') @@ -3589,19 +3578,6 @@ def test_NLLLoss_mismatched_batch(self): with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): F.nll_loss(x, t) - @unittest.skipIf(not (TEST_CUDNN and TEST_CUDNN_VERSION >= 7000), "needs cudnn >= 7.0") - def test_CTCLoss_cudnn(self): - target_lengths = [30, 25, 20] - input_lengths = [50, 50, 50] - targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) - log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2) - res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) - expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float() - with torch.backends.cudnn.flags(enabled=False): - res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, 
target_lengths) - self.assertEqual(res, expected) - self.assertEqual(res2, res) - def test_RNN_cell_no_broadcasting(self): def test(cell_module, input, hx, input_size, hidden_size): cell = cell_module(input_size, hidden_size) @@ -4375,7 +4351,7 @@ def _verify_pixel_shuffle(self, input, output, upscale_factor): self.assertEqual(output[:, c, h, w], input[:, channel_idx, height_idx, weight_idx]) def test_inplace_thnn(self): - modules = [nn.ReLU, nn.ELU, nn.SELU, nn.CELU, nn.RReLU] + modules = [nn.ReLU, nn.ELU, nn.SELU, nn.RReLU] for mod in modules: r = mod(inplace=True) input = torch.randn(5, 5, requires_grad=True) @@ -4836,12 +4812,6 @@ def test_triplet_margin_loss_swap_no_reduce(self): self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none'), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True, reduction='none')) - def test_pointwise_loss_target_grad_none_reduction(self): - i = torch.randn(5, 10) - t = torch.randn(5, 10, requires_grad=True) - self.assertEqual(F.mse_loss(i, t, reduction='none').size(), t.size()) - self.assertEqual(F.l1_loss(i, t, reduction='none').size(), t.size()) - def test_cosine_similarity(self): input1 = torch.randn(4, 4, requires_grad=True) input2 = torch.randn(4, 4, requires_grad=True) @@ -4872,30 +4842,30 @@ def test_grid_sample(self): def test_cpu_against_cuda(N, C, H, W, padding_mode): def test_shape(N, C, IH, IW, H, W, padding_mode): - input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() - grid_cpu = torch.randn(H, N, W, 2).transpose(0, 1).requires_grad_() + input_cpu = Variable(torch.randn(C, N, IH, IW).transpose(0, 1), requires_grad=True) + grid_cpu = Variable(torch.randn(H, N, W, 2).transpose(0, 1), requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = torch.randn_like(out_cpu) + gradients = out_cpu.data.new(out_cpu.size()).normal_() out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(N, C, 1, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() + base_input = torch.randn(C, IH, IW) + input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) grid_cpu = torch.randn(N, H, W, 2, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() - grid_cuda = grid_cpu.detach().cuda().requires_grad_() + input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4903,21 +4873,21 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): test_shape(N, C, H, W, H, W, 
padding_mode) # test larger output - N = random.randint(2, 8) - C = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) + N = random.randint(1, 8) + C = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, IH, IW, H, W, padding_mode) # test smaller output - N = random.randint(2, 8) - C = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) - H = random.randint(2, IH) - W = random.randint(2, IW) + N = random.randint(1, 8) + C = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) + H = random.randint(1, IH) + W = random.randint(1, IW) test_shape(N, C, IH, IW, H, W, padding_mode) # test known input on CPU @@ -4956,38 +4926,42 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): # test CUDA against CPU if TEST_CUDA: test_cpu_against_cuda(N, C, H, W, padding_mode) - if TEST_CUDNN: - with cudnn.flags(enabled=False): - test_cpu_against_cuda(N, C, H, W, padding_mode) + + # test channels >1024, which doesn't work on cudnn 7102 and further + N, C, H, W = 1, 1025, 3, 3 + self.assertTrue(gradcheck( + lambda inp, grid: F.grid_sample(inp, grid, padding_mode=padding_mode), + (input, grid))) + test_cpu_against_cuda(N, C, H, W, padding_mode) def test_grid_sample_3d(self): def test_cpu_against_cuda(N, C, D, H, W, padding_mode): def test_shape(N, C, ID, IH, IW, D, H, W, padding_mode): - input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_() - grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_() + input_cpu = Variable(torch.randn(C, N, ID, IH, IW).transpose(0, 1), requires_grad=True) + grid_cpu = Variable(torch.randn(D, N, H, W, 3).transpose(0, 1), requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, D, H, W])) - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = torch.randn_like(out_cpu) + gradients = out_cpu.data.new(out_cpu.size()).normal_() out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(N, C, 1, IH, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() + base_input = torch.randn(C, ID, IH, IW) + input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() - grid_cuda = grid_cpu.detach().cuda().requires_grad_() + input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4995,35 +4969,35 @@ def test_shape(N, C, ID, IH, IW, D, 
H, W, padding_mode): test_shape(N, C, D, H, W, D, H, W, padding_mode) # test larger output - N = random.randint(2, 8) - C = random.randint(2, 8) - ID = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) + N = random.randint(1, 8) + C = random.randint(1, 8) + ID = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) D = random.randint(ID + 1, 12) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test smaller output - N = random.randint(2, 8) - C = random.randint(2, 8) - ID = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) - D = random.randint(2, ID) - H = random.randint(2, IH) - W = random.randint(2, IW) + N = random.randint(1, 8) + C = random.randint(1, 8) + ID = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) + D = random.randint(1, ID) + H = random.randint(1, IH) + W = random.randint(1, IW) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test known input on CPU for padding_mode in ['zeros', 'border']: # do gradcheck - N = random.randint(2, 8) - C = random.randint(2, 8) - D = random.randint(2, 8) - H = random.randint(2, 8) - W = random.randint(2, 8) + N = random.randint(1, 8) + C = random.randint(1, 8) + D = random.randint(1, 8) + H = random.randint(1, 8) + W = random.randint(1, 8) input = torch.randn(N, C, D, H, W, requires_grad=True) grid = torch.randn(N, D, H, W, 3, requires_grad=True) self.assertTrue(gradcheck( @@ -5566,11 +5540,6 @@ def test_unfold_invalid_arg(self): unfold = nn.Unfold(kernel_size=(1, 3), padding=(1, 1), dilation=(1, 2)) unfold(torch.randn(1, 2, 2, 2)) - def test_softmin(self): - x = torch.randn(2, 16) - self.assertEqual(F.softmin(x, 1), F.softmax(-x, 1)) - self.assertEqual(F.softmin(x, 0), F.softmax(-x, 0)) - def test_adaptive_log_softmax(self): # args validation with self.assertRaises(ValueError): @@ -6037,20 +6006,15 @@ def add(test_name, fn): add(test_name, lambda self, test=test: test(self)) cuda_test_name = test_name + '_cuda' # With dtype enable, it's good enough to test against three floating types - kwargs = {} - if 'extra_args' in get_function_arglist(test.test_cuda): - kwargs['extra_args'] = test.extra_args - if 'dtype' in get_function_arglist(test.test_cuda): add(cuda_test_name + '_float', lambda self, - test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) + test=test: test.test_cuda(self, dtype=torch.float)) add(cuda_test_name + '_double', lambda self, - test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) - if getattr(test, 'check_half', True): - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) + test=test: test.test_cuda(self, dtype=torch.double)) + add(cuda_test_name + '_half', lambda self, + test=test: test.test_cuda(self, dtype=torch.half)) else: - add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) + add(cuda_test_name, lambda self, test=test: test.test_cuda(self)) def wrap_functional(fn, **kwargs): @@ -6210,45 +6174,6 @@ def forward(self, *args): check_sum_reduction=True, check_gradgrad=False, ), - dict( - module_name='CTCLoss', - constructor_args=(14,), # blank=14 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, 
il, tl, blank=14, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - ), - dict( - module_name='CTCLoss', - desc='1d_target', - constructor_args=(14,), # blank=14 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - ), - dict( - module_name='CTCLoss', - desc='2d_int_target', - constructor_args=(0,), # blank=0 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - convert_target=False, - ), ] @@ -7841,21 +7766,6 @@ def multimarginloss_weights_no_reduce_test(): check_inplace=True, desc='scalar' ), - dict( - module_name='CELU', - input_size=(3, 2, 5), - constructor_args=(2.,), - check_inplace=True, - reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)) - ), - dict( - module_name='CELU', - input_size=(), - constructor_args=(2.,), - check_inplace=True, - reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)), - desc='scalar' - ), dict( module_name='GLU', input_size=(5, 6), diff --git a/test/test_optim.py b/test/test_optim.py index 2d5b876dd3a8e1..41c3bfc1964f33 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -31,6 +31,7 @@ def wrapper(closure, params, state): class TestOptim(TestCase): + def _test_rosenbrock(self, constructor, old_fn): params_t = torch.Tensor([1.5, 1.5]) state = {} @@ -504,20 +505,6 @@ def forward(self, x): return self.conv2(F.relu(self.conv1(x))) -class LambdaLRTestObject: - def __init__(self, value): - self.value = value - - def __call__(self, epoch): - return self.value * epoch - - def __eq__(self, other): - if isinstance(other, self.__class__): - return self.__dict__ == other.__dict__ - else: - return False - - class TestLRScheduler(TestCase): def setUp(self): self.net = SchedulerTestNet() @@ -685,28 +672,6 @@ def test_reduce_lr_on_plateau_state_dict(self): if key not in {'optimizer', 'is_better'}: self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) - def test_lambda_lr_state_dict_fn(self): - scheduler = LambdaLR(self.opt, lr_lambda=lambda x: x) - state = scheduler.state_dict() - self.assertIsNone(state['lr_lambdas'][0]) - - scheduler_copy = LambdaLR(self.opt, lr_lambda=lambda x: x) - scheduler_copy.load_state_dict(state) - for key in scheduler.__dict__.keys(): - if key not in {'optimizer', 'lr_lambdas'}: - self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) - - def test_lambda_lr_state_dict_obj(self): - scheduler = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(10)) - state = scheduler.state_dict() - self.assertIsNotNone(state['lr_lambdas'][0]) - - scheduler_copy = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(-1)) - scheduler_copy.load_state_dict(state) - for key in scheduler.__dict__.keys(): - if key not in {'optimizer'}: - self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) - def 
_check_scheduler_state_dict(self, constr, constr2, epochs=10): scheduler = constr() for _ in range(epochs): diff --git a/test/test_torch.py b/test/test_torch.py index edd69473f8505b..2a8c897713111f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -22,7 +22,7 @@ from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ - IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ROCM + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize, TEST_WITH_ROCM from multiprocessing.reduction import ForkingPickler if TEST_NUMPY: @@ -866,6 +866,7 @@ def test_multidim(x, dim): def test_dim_reduction(self): self._test_dim_reduction(self, lambda t: t) + @skipIfNoZeroSize def test_reduction_empty(self): fns_to_test = [ # name, function, identity @@ -929,6 +930,7 @@ def test_reduction_empty(self): self.assertEqual(torch.ones((2, 1, 4), device=device), xb.all(1, keepdim=True)) self.assertEqual(torch.ones((), device=device), xb.all()) + @skipIfNoZeroSize def test_pairwise_distance_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -1688,7 +1690,6 @@ def test_einsum(self): ("...ii->...i", I), # batch diagonal # -- Other ("bn,anm,bm->ba", l, w, r), # as torch.bilinear - ("... ii->...i ", I), # batch diagonal with spaces ] for test in test_list: actual = torch.einsum(test[0], test[1:]) @@ -2239,6 +2240,7 @@ def test_tensor_factory_cuda_type(self): self.assertTrue(x.is_cuda) torch.set_default_tensor_type(saved_type) + @skipIfNoZeroSize def test_tensor_factories_empty(self): # ensure we can create empty tensors from each factory function shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 2, 0, 0)] @@ -2925,6 +2927,7 @@ def _test_in_place_broadcastable(t0, t1, t2=None): def test_broadcast(self): self._test_broadcast(self, lambda t: t) + @skipIfNoZeroSize def test_broadcast_empty(self): # empty + empty self.assertRaises(RuntimeError, lambda: torch.randn(5, 0) + torch.randn(0, 5)) @@ -2940,17 +2943,6 @@ def test_broadcast_empty(self): torch.randn(0, 7, 0, 6, 5, 0, 1) + torch.randn(1, 1, 5, 1, 7)) self.assertRaises(RuntimeError, lambda: torch.randn(7, 0) + torch.randn(2, 1)) - def test_broadcast_tensors(self): - x0 = torch.randn(2, 1, 3) - x1 = torch.randn(3) - x2 = torch.randn(3, 1) - expected_size = (2, 3, 3) - - y0, y1, y2 = torch.broadcast_tensors(x0, x1, x2) - self.assertTrue(y0.size() == expected_size) - self.assertTrue(y1.size() == expected_size) - self.assertTrue(y2.size() == expected_size) - @staticmethod def _test_contiguous(self, cast): x = cast(torch.randn(1, 16, 5, 5)) @@ -2965,7 +2957,9 @@ def test_contiguous(self): return self._test_contiguous(self, lambda t: t) def test_empty_tensor_props(self): - sizes = [(0,), (0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] + sizes = [(0,)] + if torch._C._use_zero_size_dim(): + sizes += [(0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for size in sizes: for device in devices: @@ -3482,6 +3476,9 @@ def test_cat_empty_legacy(self): @staticmethod def _test_cat_empty(self, use_cuda=False): + if not torch._C._use_zero_size_dim(): + return + dtype = torch.float32 device = 'cuda' if use_cuda else 'cpu' @@ -3527,6 +3524,9 @@ def test_narrow(self): self.assertEqual(x.narrow(-2, -1, 1), torch.Tensor([[6, 7, 8]])) def test_narrow_empty(self): + if not 
torch._C._use_zero_size_dim(): + return + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(2, 3, 4, device=device) @@ -3658,7 +3658,7 @@ def test_randn(self): self.assertEqual(res1, res2) def test_slice(self): - empty = torch.empty(0, 4) + empty = torch.empty(0, 4) if torch._C._use_zero_size_dim() else torch.Tensor() x = torch.arange(0., 16).view(4, 4) self.assertEqual(x[:], x) self.assertEqual(x[:4], x) @@ -4951,7 +4951,10 @@ def consec(size, start=1): reference = conv_fn(consec((3, 3, 3))) # empty tensor indexing - self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) + if torch._C._use_zero_size_dim(): + self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) + else: + self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new()) self.assertEqual(reference[0], consec((3, 3)), 0) self.assertEqual(reference[1], consec((3, 3), 10), 0) @@ -4997,9 +5000,14 @@ def consec(size, start=1): self.assertEqual(reference[None, 2:5, None, None], reference.unsqueeze(0)[:, 2:5].unsqueeze(2).unsqueeze(2)) # indexing 0-length slice - self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) - self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) - self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) + if torch._C._use_zero_size_dim(): + self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) + self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) + self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) + else: + self.assertEqual(torch.tensor([]), reference[slice(0)]) + self.assertEqual(torch.tensor([]), reference[slice(0), 2]) + self.assertEqual(torch.tensor([]), reference[2, slice(0)]) self.assertEqual(torch.tensor([]), reference[2, 1:1, 2]) # indexing with step @@ -5709,6 +5717,7 @@ def check(src, idx): check(src, idx) check(src.transpose(1, 2), idx) + @skipIfNoZeroSize def test_take_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -5739,6 +5748,7 @@ def test_put_accumulate(self): dst.put_(idx, src, accumulate=True) self.assertEqual(dst.tolist(), [[5, 7], [1, 1]]) + @skipIfNoZeroSize def test_put_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6060,6 +6070,7 @@ def _test_view(self, cast): def test_view(self): TestTorch._test_view(self, lambda x: x) + @skipIfNoZeroSize def test_view_empty(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.view(1, 0, 6, 1, 1).shape) @@ -6085,8 +6096,12 @@ def test_reshape(self): self.assertEqual(empty, empty.reshape(-1)) self.assertEqual(empty, empty.reshape([0])) # TODO: fix these once we have multi-dimensional empty tensors - self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) - self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) + if torch._C._use_zero_size_dim(): + self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) + self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) + else: + self.assertEqual(empty.reshape([0, 1]).shape, (0,)) + self.assertEqual(empty.reshape([1, -1]).shape, (0,)) self.assertRaises(RuntimeError, lambda: empty.reshape(1)) x = torch.randn(3, 3) @@ -6094,6 +6109,7 @@ def test_reshape(self): self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(1, 9, 1)).data_ptr()) self.assertRaises(RuntimeError, lambda: x.reshape_as(torch.rand(10))) + @skipIfNoZeroSize def test_empty_reshape(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.reshape(1, 
0, 6, 1, 1).shape) @@ -6103,6 +6119,7 @@ def test_empty_reshape(self): # match NumPy semantics -- don't infer the size of dimension with a degree of freedom self.assertRaises(RuntimeError, lambda: x.reshape(0, -1)) + @skipIfNoZeroSize def test_tensor_shape_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6168,6 +6185,7 @@ def test_tensor_shape_empty(self): self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)]) # functions that operate over a dimension but don't reduce. + @skipIfNoZeroSize def test_dim_function_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6291,6 +6309,7 @@ def test_dim_function_empty(self): c = torch.randn((0, 1, 2), device=device) self.assertEqual(c, c.index_select(0, ind_empty)) + @skipIfNoZeroSize def test_blas_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6360,6 +6379,7 @@ def fn(torchfn, *args): A_LU, pivots = fn(torch.btrifact, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) + @skipIfNoZeroSize def test_blas_alpha_beta_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6385,6 +6405,7 @@ def test_blas_alpha_beta_empty(self): self.assertEqual(torch.full((2, 3), beta * value, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) + @skipIfNoZeroSize @skipIfNoLapack def test_lapack_empty(self): # FIXME: these are just a selection of LAPACK functions -- we need a general strategy here. @@ -6875,6 +6896,9 @@ def test_nonzero(self): self.assertNotEqual(tensor[dst1[i, 0], dst1[i, 1], dst1[i, 2]].item(), 0) def test_nonzero_empty(self): + if not torch._C._use_zero_size_dim(): + return + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(0, 2, 0, 5, 0, device=device) @@ -7499,11 +7523,15 @@ def test_load_error_msg(self): expected_err_msg = (".*You can only torch.load from a file that is seekable. 
" + "Please pre-load the data into a buffer like io.BytesIO and " + "try to load from it instead.") - - resource = FilelikeMock(data=b"data") - delattr(resource, "tell") - delattr(resource, "seek") - self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) + if PY3: + import urllib.request + import io + resource = urllib.request.urlopen('https://download.pytorch.org/test_data/linear.pt') + self.assertRaisesRegex(io.UnsupportedOperation, expected_err_msg, lambda: torch.load(resource)) + else: + import urllib + resource = urllib.urlopen('https://download.pytorch.org/test_data/linear.pt') + self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) def test_from_buffer(self): a = bytearray([1, 2, 3, 4]) @@ -7866,7 +7894,10 @@ def test_from_numpy(self): # check zero dimensional x = np.zeros((0, 2)) - self.assertEqual(torch.from_numpy(x).shape, (0, 2)) + if torch._C._use_zero_size_dim(): + self.assertEqual(torch.from_numpy(x).shape, (0, 2)) + else: + self.assertEqual(torch.from_numpy(x).shape, (0,)) # check ill-sized strides raise exception x = np.array([3., 5., 8.]) @@ -7916,20 +7947,6 @@ def test_ctor_with_numpy_array(self): for i in range(len(array)): self.assertEqual(tensor[i], array[i]) - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") - def test_ctor_with_numpy_scalar_ctor(self): - dtypes = [ - np.double, - np.float, - np.float16, - np.int64, - np.int32, - np.int16, - np.uint8 - ] - for dtype in dtypes: - self.assertEqual(dtype(42), torch.tensor(dtype(42)).item()) - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_index(self): i = np.int32([0, 1, 2]) @@ -8017,17 +8034,6 @@ def test_numpy_array_interface(self): for i in range(len(x)): self.assertEqual(geq2_x[i], geq2_array[i]) - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") - def test_multiplication_numpy_scalar(self): - np_sc = np.float64(2.0) - t = torch.ones(2, requires_grad=True) - r1 = np_sc * t - self.assertIsInstance(r1, torch.Tensor) - self.assertTrue(r1.requires_grad) - r2 = t * np_sc - self.assertIsInstance(r2, torch.Tensor) - self.assertTrue(r2.requires_grad) - def test_error_msg_type_translation(self): with self.assertRaisesRegex( RuntimeError, diff --git a/third_party/eigen b/third_party/eigen index cafae68f33f7f4..e9e95489a0b241 160000 --- a/third_party/eigen +++ b/third_party/eigen @@ -1 +1 @@ -Subproject commit cafae68f33f7f41270b2e8c2dd181f510aa4d918 +Subproject commit e9e95489a0b241412e31f0525e85b2fab386c786 diff --git a/third_party/onnx b/third_party/onnx index 32ac71b1b9c1bd..c761845c7f6880 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 32ac71b1b9c1bd7f196eed3b311734ec6ab3c367 +Subproject commit c761845c7f6880ab7eb7e2866d673834c7149e89 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index a66cb77f8ce9dd..14fd6d7cf5e09c 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -201,9 +201,6 @@ - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) self, weight, bias: conv_tbc_backward(grad, self, weight, bias, pad) -- name: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank) - log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank) - - name: det(Tensor self) self: det_backward(grad, self, result) @@ -311,12 +308,6 @@ self: gesv_backward_self(grad, self, A) A: gesv_backward_A(grad, self, A, result0) -- name: 
grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) - input, grid: grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode) - -- name: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) - input, grid: grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode) - - name: gt_(Tensor self, Scalar other) self: zeros_like(self) @@ -811,8 +802,8 @@ - name: relu(Tensor self) self: threshold_backward(grad, self, 0, 0) -- name: elu_forward(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) - self: elu_backward(grad, alpha, scale, input_scale, output) +- name: elu_forward(Tensor self, Scalar alpha, Scalar scale) + self: elu_backward(grad, alpha, scale, output) - name: glu_forward(Tensor self, int64_t dim) self: glu_backward(grad, self, dim) @@ -983,6 +974,12 @@ - name: thnn_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, Tensor columns, Tensor ones, std::array output_mask) grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, {{0, 0, 0}}, 1, false, false, false, grad_input_mask) +- name: thnn_grid_sampler_bilinear2d_forward(Tensor self, Tensor grid, int64_t padding_mode) + self, grid: thnn_grid_sampler_bilinear2d_backward(grad, self, grid, padding_mode) + +- name: thnn_grid_sampler_bilinear3d_forward(Tensor self, Tensor grid, int64_t padding_mode) + self, grid: thnn_grid_sampler_bilinear3d_backward(grad, self, grid, padding_mode) + # NN double backwards support - name: adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) @@ -1009,9 +1006,9 @@ grad_output: avg_pool3d(grad, kernel_size, stride, padding, ceil_mode, count_include_pad) self: zeros_like(self) -- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) - grad_output: elu_backward(grad, alpha, scale, input_scale, output) - output: grad * grad_output * input_scale * (output < 0).toType(grad.type()) +- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Tensor output) + grad_output: elu_backward(grad, alpha, scale, output) + output: grad * grad_output * (output < 0).toType(grad.type()) - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList output_size, Tensor indices) grad_output: max_pool_double_backward(grad, indices, 2) @@ -1148,8 +1145,6 @@ output: -2 * output * grad * grad_output # cudnn -- name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) - log_probs: result1 - name: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) self, weight, bias: cudnn_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 2bee61b024317e..45af42655f96cc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -340,8 +340,6 @@ def save_variables(saved_variables, is_output): elif arg['type'] == 'TensorList': name += '_' expr = 'make_saved_variable_list({})'.format(arg['name']) - elif 
arg['type'] == 'IntList': - expr = expr + ".vec()" stmts.append('grad_fn->{} = {};'.format(name, expr)) return stmts diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 0622fae5f2e8e8..f859f814b4f8bc 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -175,7 +175,7 @@ Tensor prod_safe_zeros_backward(const Tensor &grad, const Tensor& inp, int64_t d return grad; } - auto ones_size = inp.sizes().vec(); + std::vector ones_size(inp.sizes()); ones_size[dim] = 1; Tensor ones = at::ones(ones_size, grad.type()); Tensor exclusive_normal_nocp = at::cat({ones, inp.narrow(dim, 0, inp.size(dim) - 1)}, dim); @@ -328,7 +328,7 @@ Tensor cumprod_backward(const Tensor &grad, const Tensor &input, int64_t dim) { return sum_scan_exclusive(result * grad, dim) / input; } - auto ones_size = input.sizes().vec(); + std::vector ones_size(input.sizes()); ones_size[dim] = 1; Tensor ones = at::ones({1}, grad.type()).expand(ones_size); Tensor grad_input = at::zeros(input.sizes(), grad.type()); @@ -461,7 +461,7 @@ Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntList sizes, } Tensor renorm_backward(const Tensor & grad, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { - auto transposed_sizes = self.transpose(dim, 0).sizes().vec(); + auto transposed_sizes = std::vector(self.transpose(dim, 0).sizes()); auto flatten = [&](const Tensor & t) { return t.transpose(dim, 0).contiguous().view({t.size(dim), -1}); }; @@ -637,7 +637,7 @@ Tensor split_with_sizes_backward(const std::vector &g grads_all_defined[j] = grads[j]; } else { auto length = split_sizes[j]; - auto grad_size = sizes.vec(); + std::vector grad_size(sizes); grad_size[dim] = length; grads_all_defined[j] = at::zeros(grad_size, type); } @@ -659,7 +659,7 @@ Tensor split_backward(const std::vector &grads, Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) { AT_ASSERT(indices.dim() >= dim); - auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); + auto size = std::vector(indices.sizes().slice(0, indices.dim() - dim)); size.push_back(-1); auto indices_view = indices.view(size); return grad.contiguous().view(size).gather(-1, indices_view).view(indices.sizes()); @@ -686,7 +686,7 @@ Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, cons Tensor glu_double_backward_grad_output(const Tensor & grad, const Tensor & input, int64_t dim) { if (dim < 0) dim += input.dim(); - auto sizes = input.sizes().vec(); + std::vector sizes = input.sizes(); sizes[dim] /= 2; auto tmp = grad * glu_backward(at::ones(sizes, input.type()), input, dim); return tmp.narrow(dim, 0, sizes[dim]) + tmp.narrow(dim, sizes[dim], sizes[dim]); @@ -1545,27 +1545,27 @@ Tensor symeig_backward(const std::vector &grads, cons bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) { auto glambda = grads[0]; auto gv = grads[1]; - + auto vt = v.t(); - + if (!eigenvectors) { throw std::runtime_error(std::string("cannot compute backward without " "computing eigenvectors in forward pass")); } - + Tensor result; if (gv.defined()) { Tensor F = lambda.unsqueeze(0).expand_as(self).clone(); F.sub_(at::unsqueeze(lambda, 1)); F.diagonal().fill_(INFINITY); F.pow_(-1); - + F.mul_(vt.mm(gv)); result = v.mm(F.mm(vt)); } else { result = at::zeros_like(self); } - + if (glambda.defined()) { result.add_((v * glambda).mm(vt)); } diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 
00d927f1fdf7f8..ae95bf7197770e 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -29,7 +29,7 @@ struct TypeAndSize { TypeAndSize() : type(nullptr) {} /* implicit */ TypeAndSize(const Tensor & t) - : sizes(t.sizes().vec()) + : sizes(t.sizes()) , type(&t.type()) {} Tensor zeros() { return at::zeros(sizes, *type); } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index bd4c59cfe9d380..2f1adf0ab59f4b 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -398,7 +398,7 @@ Tensor VariableType::contiguous(const Tensor & self) const { static std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { - args_sizes[i] = tensors[i].sizes().vec(); + args_sizes[i] = tensors[i].sizes(); } return args_sizes; } diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 8f79c2830e96c0..4a0dbd04c905f1 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -97,11 +97,7 @@ if [[ $(uname) == 'Darwin' ]]; then LDFLAGS="$LDFLAGS -Wl,-rpath,@loader_path" LD_POSTFIX=".dylib" else - if [[ $USE_ROCM -eq 1 ]]; then - LDFLAGS="$LDFLAGS -Wl,-rpath,\\\\\\\$ORIGIN" - else - LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" - fi + LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" fi CPP_FLAGS=" -std=c++11 " GLOO_FLAGS="" diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 77b101dedf0f3e..abbadc70691b46 100644 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -7,7 +7,6 @@ import subprocess import sys - DEFAULT_FILE_PATTERN = r".*\.[ch](pp)?" # @@ -start,count +start,count @@ @@ -27,11 +26,6 @@ def run_shell_command(arguments, process_name=None): return output.decode() -def normalize_directory_path(path): - """Normalizes a directory path.""" - return path.rstrip('/') - - def transform_globs_into_regexes(globs): """Turns glob patterns into regular expressions.""" return [glob.replace("*", ".*").replace("?", ".") for glob in globs] @@ -55,37 +49,16 @@ def git_diff(args, verbose): return run_shell_command(command, process_name="git diff") -def filter_files(files, file_patterns, verbose): +def filter_files(files, file_patterns): """Returns all files that match any of the patterns.""" filtered = [] for file in files: - has_match = False for pattern in file_patterns: - if pattern.search(file): + if pattern.match(file): filtered.append(file) - has_match = True - if not has_match and verbose: - message = "{} does not match any ".format(file) - message += "file pattern in {{{}}}".format(', '.join(map(str, file_patterns))) - print(message) return filtered -def remove_recursive_files(files, paths, verbose): - """ - Removes all files that are not immediately under one of the given paths. - """ - for file in files: - if os.path.dirname(file) in paths: - yield file - else: - if verbose: - - message = "{} ({}) does not match any ".format(file, os.path.dirname(file)) - message += "non-recursive path in {{{}}}".format(", ".join(paths)) - print(message) - - def get_changed_files(revision, paths, verbose): """Runs git diff to get the paths of all changed files.""" # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). 
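Note on the simplified filter_files() in tools/clang_tidy.py above: a minimal, self-contained sketch of how the new pattern-based filtering behaves. The DEFAULT_FILE_PATTERN regex and the filter_files body are taken from the patch; the sample paths and the __main__ harness are illustrative assumptions only, not part of the change.

import re

# In tools/clang_tidy.py the pattern is kept as a raw string and compiled elsewhere;
# compiling it up front here keeps the sketch self-contained.
DEFAULT_FILE_PATTERN = re.compile(r".*\.[ch](pp)?")

def filter_files(files, file_patterns):
    """Returns all files that match any of the patterns."""
    filtered = []
    for file in files:
        for pattern in file_patterns:
            if pattern.match(file):
                filtered.append(file)
    return filtered

if __name__ == "__main__":
    # Hypothetical candidate paths; only the C/C++ sources survive the filter.
    candidates = ["torch/csrc/Module.cpp", "tools/clang_tidy.py", "aten/src/ATen/Tensor.h"]
    print(filter_files(candidates, [DEFAULT_FILE_PATTERN]))
    # ['torch/csrc/Module.cpp', 'aten/src/ATen/Tensor.h']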
@@ -179,17 +152,7 @@ def parse_options(): ) parser.add_argument("-r", "--revision", help="Git revision to get changes from") parser.add_argument( - "-p", - "--paths", - nargs="+", - default=["."], - help="Lint only the given paths (recursively)", - ) - parser.add_argument( - "-n", - "--no-recursive", - action="store_true", - help="If paths are supplied with -p/--paths, do not recurse into paths", + "-p", "--paths", nargs="+", default=["."], help="Lint only the given paths" ) parser.add_argument( "-s", @@ -210,15 +173,12 @@ def parse_options(): def main(): options = parse_options() - paths = map(normalize_directory_path, options.paths) if options.revision: - files = get_changed_files(options.revision, paths, options.verbose) + files = get_changed_files(options.revision, options.paths, options.verbose) else: - files = get_all_files(paths) - if options.no_recursive: - files = remove_recursive_files(files, paths, options.verbose) + files = get_all_files(options.paths) file_patterns = get_file_patterns(options.glob, options.regex) - files = filter_files(files, file_patterns, options.verbose) + files = filter_files(files, file_patterns) # clang-tidy error's when it does not get input files. if not files: diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index 6a50c14e05523e..b35435acb388c6 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -24,7 +24,6 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ -DCMAKE_INSTALL_MESSAGE=NEVER \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -G "$GENERATE" \ $PYTORCHPATH/ $MAKE -j "$JOBS" install diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index 6dd9a589cf1074..92a9b9981ed697 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -24,7 +24,6 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_INSTALL_MESSAGE=NEVER \ -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -DVERBOSE:BOOL=${VERBOSE:-0} \ -G "$GENERATE" \ $PYTORCHPATH/torch diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 5a76d447ad2498..ad9ad2e05c4f4c 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -52,6 +52,28 @@ def jit_type_of(arg): typ = '{}?'.format(typ) return typ +# map from aten 'simple_type' to the function that will cast an attribute value +# to that type +FROM_ATTRIBUTE = { + 'Device': 'as_device(node->is(attr::{}))', + 'IntList': 'std::vector<int64_t>(node->is(attr::{}))', + 'Layout': 'static_cast<at::Layout>(node->i(attr::{}))', + 'Scalar': 'Scalar(node->t(attr::{}))', + 'ScalarType': 'static_cast<at::ScalarType>(node->i(attr::{}))', + 'Tensor': 'node->t(attr::{})', + 'bool': 'bool(node->i(attr::{}))', + 'double': 'node->f(attr::{})', + 'int64_t': 'node->i(attr::{})', + 'std::array<bool,2>': 'as_bool_array<2>(node->is(attr::{}))', + 'std::array<bool,3>': 'as_bool_array<3>(node->is(attr::{}))', + 'std::array<bool,4>': 'as_bool_array<4>(node->is(attr::{}))', +} + + +def from_attribute(arg): + simple_type = arg['simple_type'] + return FROM_ATTRIBUTE[simple_type].format(arg['name']) + # map from aten 'simple_type' to the function that will turn a tensor into # that type @@ -62,7 +84,6 @@ def jit_type_of(arg): 'Scalar': '{}.toScalar()', 'ScalarType': 'static_cast<at::ScalarType>({}.toInt())', 'Tensor': '{}.toTensor()', - 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 'int64_t':
'{}.toInt()', @@ -77,13 +98,15 @@ def from_ivalue(arg, value): return FROM_IVALUE[simple_type].format(value) +KW_ACCESS = CodeTemplate("""(node->${method}(Symbol::attr("${name}")))""") + CALL_NAMESPACE = CodeTemplate("""\ auto result = at::${name}( ${args} ); """) CALL_METHOD = CodeTemplate("""\ -DeviceGuard device_guard(deviceForInputs(stack, ${num_inputs})); +DeviceGuard device_guard(deviceForInputs(stack, ${num_dynamic_inputs})); auto result = (${first}).${name}( ${args} ); @@ -99,20 +122,24 @@ def from_ivalue(arg, value): ); """) +# TODO (apaszke): remove the attributed codepath once we remove attributes CONSTRUCTOR = CodeTemplate("""\ -[](Stack & stack) { +[](Node *node) { + ${kw_assignments} + return Operation([=](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); ${call} - drop(stack, ${num_inputs}); + drop(stack, ${num_dynamic_inputs}); pack(stack, std::move(result)); return 0; + }); } """) OPERATOR = CodeTemplate("""\ Operator( "${signature}", - ${op} + ${ops} ), """) @@ -144,6 +171,9 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] + # Only support a single TensorList arg + if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: + return False return ((not decl['api_name'].endswith('_') or is_magic_method(decl['api_name'])) and not decl['name'].endswith('_out') and @@ -167,7 +197,7 @@ def gen_jit_dispatch(declarations, out, template_path): ops = [] - def get_invocation(decl, args, num_inputs): + def get_invocation(decl, args, num_dynamic_inputs): # because the arg list can get lengthy we put them on a separate line def pack_arguments(args): @@ -181,36 +211,109 @@ def pack_arguments(args): elif 'namespace' in decl['method_of']: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), - num_inputs=num_inputs) + num_dynamic_inputs=num_dynamic_inputs) else: return CALL_METHOD.substitute( name=decl['name'], first=args[0], args=pack_arguments(args[1:]), - num_inputs=num_inputs) + num_dynamic_inputs=num_dynamic_inputs) - def emit_decl_variant(decl): + def emit_decl_variant(decl, is_positional_arg, has_tensorlist): + # is_positional_arg is a boolean list the same length as decl['arguments'] + # that indicates if the argument should come from the positional list + # of inputs. If false, the argument comes from the constant attributes kw_assignments = [] arguments = [] - num_inputs = len(decl['arguments']) + + if has_tensorlist: + kw_assignments.append('size_t varargs_length = node->inputs().size();') + # arguments look like: [tensor list], arg1, arg2, arg3 + # we use peek(, static_inputs) to read the non-vararg inputs + # from the end of the stack + static_inputs = sum(is_positional_arg) - 1 + num_dynamic_inputs = 'varargs_length' + tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] + else: + static_inputs = sum(is_positional_arg) + num_dynamic_inputs = static_inputs real_inputs = 0 - for arg in decl['arguments']: - if arg['simple_type'] in default_only_types: + for i, arg in enumerate(decl['arguments']): + # This conditional allows us to process a flattened argument list that + # contains a single TensorList. Given the sequence of arguments: + # a b c [d e f g] h i # [] is the list + # + # 1.
For the section where we are processing positional inputs before the + # TensorList: + # a b c [d e f g] h i # [] is the list + # ~~~~~~~~~~~~ <- N + # we set this view_length to the total number of varargs inputs (i.e. the length) + # of the whole argument list. This means that when indexing into the list using peek(), + # we will retrieve arguments at their true indices (i.e. peek at 0 points to a, + # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the + # list itself this way. + # 2. After the list: + # a b c [d e f g] h i # [] is the list + # ~~~~~~ <- N + # Here we set the view length to static_inputs. In our example, + # we effectively ignore the fact that we have a list here. What is + # significant is that our index i is equivalent when the view length + # is right-justified, whether we have the list or not. Concretely, + # indexing h or i from `a b c [d e f g] h i` is equivalent to indexing + # h or i from `a b c h i`. + view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs + + if arg['simple_type'] == 'TensorList': + # NOTE: don't advance real_inputs here. After this we are going + # to switch over to indexing from the end as if we only had + # the static arguments. + arguments.append('toTensors(peekSlice(stack, {}, varargs_length - {}, varargs_length))' + .format(real_inputs, static_inputs)) + elif arg['simple_type'] in default_only_types: arguments.append(arg['default']) - else: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) + elif is_tensor_arg(arg) or is_positional_arg[i]: + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, view_length) arguments.append(from_ivalue(arg, value)) real_inputs += 1 + else: + assign = "auto {} = {};".format(arg['name'], from_attribute(arg)) + kw_assignments.append(assign) + arguments.append(arg['name']) - call = get_invocation(decl, arguments, num_inputs) + call = get_invocation(decl, arguments, num_dynamic_inputs) returns = decl['returns'] + all_scalars = all(r['dynamic_type'] != 'TensorList' for r in returns) constructor = CONSTRUCTOR.substitute(name=decl['name'], call=call, kw_assignments=kw_assignments, - num_inputs=num_inputs) + num_dynamic_inputs=num_dynamic_inputs) return constructor + def emit_decl(decl): + arguments = decl['arguments'] + has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) + num_tensor_args = sum(map(is_tensor_arg, arguments)) + + # Right now, we generate dispatch methods that either take all non-tensor arguments + # as attributes, or don't use any attributes at all. In the future we might want to + # have something in the middle too (might be useful for e.g. constant propagation + # into attributes, as that would allow us to avoid reparsing tensors into scalar + # args at every invocation). + + all_real_arguments_are_inputs = tuple(arg['simple_type'] not in default_only_types for arg in arguments) + only_tensors_are_inputs = tuple(is_tensor_arg(arg) for arg in arguments) + + variants = [emit_decl_variant(decl, all_real_arguments_are_inputs, has_tensorlist)] + # in some cases there are no inputs that are possibly attributes, so the + # variants are actually the same. If so avoid generating both to save compilation + # time. + if all_real_arguments_are_inputs != only_tensors_are_inputs: + variants += [',', emit_decl_variant(decl, only_tensors_are_inputs, has_tensorlist)] + + ops.append(OPERATOR.substitute(signature=signature(decl), + ops=variants)) + # This function declares an order on declarations.
This is necessary because # there is some ambiguity in the choice of overload: if an argument is overloaded # to accept both Scalar and Tensor, the schema with the Tensor should come first @@ -273,8 +376,7 @@ def declkey(decl): jit_decls = sort_decls(jit_decls) for decl in jit_decls: - ops.append(OPERATOR.substitute(signature=signature(decl), - op=emit_decl_variant(decl))) + emit_decl(decl) # Sort the generated snippets to ensure that the generation is deterministic env = { diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 3dc973463d6e90..06ad9c2840b1cc 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -29,6 +29,7 @@ using autograd::Variable; using autograd::variable_list; using at::Scalar; using at::Tensor; +using at::TensorList; using at::TensorOptions; using at::DeviceGuard; @@ -41,20 +42,26 @@ int deviceForInputs(Stack & stack, size_t N) { return t.type().is_cuda() ? (int) t.get_device() : -1; } +std::vector toTensors(at::ArrayRef ivalues) { + return fmap(ivalues, [](const IValue& v) { + return v.toTensor(); + }); +} + template -std::array as_bool_array(at::ArrayRef vec) { +std::array as_bool_array(const std::vector& vec) { std::array res; JIT_ASSERT(vec.size() == N); std::copy(vec.begin(), vec.end(), res.begin()); return res; } -at::Device as_device(ArrayRef elements) { +at::Device as_device(const std::vector& elements) { return at::Device(static_cast(elements[0]), elements[1]); } RegisterOperators reg({ - ${constructors} +${constructors} }); } // anon namespace diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 057bf6efeac3dd..88546fda7ed604 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -102,7 +102,6 @@ add_custom_command( "${TOOLS_PATH}/autograd/gen_autograd.py" "${TOOLS_PATH}/autograd/gen_autograd_functions.py" "${TOOLS_PATH}/autograd/gen_variable_type.py" - "${TOOLS_PATH}/jit/gen_jit_dispatch.py" "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" "${TOOLS_PATH}/jit/templates/aten_interned_strings.h" WORKING_DIRECTORY "${TORCH_SRC_DIR}/..") @@ -139,7 +138,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp @@ -163,6 +161,8 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp ${TORCH_SRC_DIR}/csrc/jit/type.cpp + ${TORCH_SRC_DIR}/csrc/onnx/onnx.cpp + ${TORCH_SRC_DIR}/csrc/onnx/onnx.npb.cpp ${TORCH_SRC_DIR}/csrc/torch.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp @@ -267,12 +267,6 @@ if(OPENMP_FOUND) target_link_libraries(torch -fopenmp) endif() -if (NOT NO_API AND NOT USE_ROCM) - target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) -endif() - if(USE_CUDA) if(MSVC) set(TORCH_CUDA_LIBRARIES @@ -371,7 +365,7 @@ install(TARGETS torch ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") # JIT Tests. 
TODO: Put into test/cpp/jit folder -if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) +if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) @@ -385,6 +379,10 @@ if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) + target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) + set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") add_executable(test_api diff --git a/torch/__init__.py b/torch/__init__.py index a40111bcca6b02..3fbb0b76fcc386 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -298,8 +298,3 @@ def manager_path(): # attach docstrings to torch and tensor functions from . import _torch_docs, _tensor_docs, _storage_docs del _torch_docs, _tensor_docs, _storage_docs - - -def compiled_with_cxx11_abi(): - r"""Returns whether PyTorch was built with _GLIBCXX_USE_CXX11_ABI=1""" - return _C._GLIBCXX_USE_CXX11_ABI diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index af367c3e544905..2194310a46d522 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -402,6 +402,16 @@ PyObject *THPModule_isDefaultTypeCuda(PyObject *_unused, PyObject *arg) { END_HANDLE_TH_ERRORS } +PyObject *THPModule_useZeroSizeDim(PyObject *_unused, PyObject *arg) { + HANDLE_TH_ERRORS +#ifdef USE_TH_SIZE_ZERO_DIM + Py_RETURN_TRUE; +#else + Py_RETURN_FALSE; +#endif + END_HANDLE_TH_ERRORS +} + static PyMethodDef TorchMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, NULL}, {"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, NULL}, @@ -432,6 +442,7 @@ static PyMethodDef TorchMethods[] = { {"set_flush_denormal", (PyCFunction)THPModule_setFlushDenormal, METH_O, NULL}, {"get_default_dtype", (PyCFunction)THPModule_getDefaultDtype, METH_NOARGS, NULL}, {"_is_default_type_cuda", (PyCFunction)THPModule_isDefaultTypeCuda, METH_NOARGS, NULL}, + {"_use_zero_size_dim", (PyCFunction)THPModule_useZeroSizeDim, METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} }; @@ -613,13 +624,6 @@ static PyObject* initModule() { ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); -#ifdef _GLIBCXX_USE_CXX11_ABI - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", - _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); -#else - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); -#endif - auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index 2ae5c5d93752c1..c0f56eea72fbd0 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -48,7 +48,7 @@ class CursorBase { /// A `(key, value)` pair exposed by cursor iterators. 
struct Item { - Item(const std::string& key_, T& value_); + Item(const std::string& key_, T& module_); T& operator*(); const T& operator*() const; diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 1f12f0a65c7460..7327d03f11b887 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -18,7 +18,7 @@ struct AnomalyMode { struct AnomalyMetadata { - virtual ~AnomalyMetadata() = default; + virtual ~AnomalyMetadata(){}; virtual void store_stack() = 0; virtual void print_stack() = 0; }; diff --git a/torch/csrc/autograd/aten_variable_hooks.cpp b/torch/csrc/autograd/aten_variable_hooks.cpp index 2f3899e4f8b59a..7a2c3974c2227c 100644 --- a/torch/csrc/autograd/aten_variable_hooks.cpp +++ b/torch/csrc/autograd/aten_variable_hooks.cpp @@ -6,7 +6,6 @@ namespace torch { namespace autograd { struct VariableHooks : public at::VariableHooksInterface { VariableHooks(at::VariableHooksArgs) {} void registerVariableTypeFor(at::Context*, at::Backend, at::ScalarType) const override; - at::Type& getVariableType(const at::Type&) const override; }; // Sigh, the registry doesn't support namespaces :( @@ -21,8 +20,4 @@ void VariableHooks::registerVariableTypeFor(at::Context* context, at::Backend ba register_variable_type_for(baseType); } -at::Type& VariableHooks::getVariableType(const at::Type& baseType) const { - return *VariableType::getType(baseType); -} - }} // torch::autograd diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 74e15f5caefe9d..8309ba1ce1038c 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -159,7 +159,7 @@ struct GraphTask { std::unordered_map exec_info; std::vector captured_vars; - void init_to_execute(Function& graph_root, const edge_list& outputs); + void init_to_execute(Function& graph_root, const edge_list& captures); // The value of worker_device in the thread that created this task. 
// See Note [Reentrant backwards] @@ -499,14 +499,14 @@ struct ClearCallbacks { std::mutex& callbacks_lock; }; -auto Engine::execute(const edge_list& roots, +auto Engine::execute(const edge_list& input_roots, const variable_list& inputs, bool keep_graph, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); - validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { + validate_outputs(input_roots, const_cast(inputs), [](const std::string& msg) { return msg; }); @@ -517,7 +517,7 @@ auto Engine::execute(const edge_list& roots, std::unique_lock lock(graph_task.mutex); // Now compute the dependencies for all executable functions and queue the root - auto graph_root = std::make_shared(roots, inputs); + auto graph_root = std::make_shared(input_roots, inputs); compute_dependencies(graph_root.get(), graph_task); if (!outputs.empty()) { graph_task.init_to_execute(*graph_root, outputs); diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index 94490303ccc240..db8b3357ac2536 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -57,7 +57,7 @@ struct TORCH_API Engine { ReadyQueue& ready_queue(int device); void start_threads(); virtual void thread_init(int device); - virtual void thread_main(GraphTask *graph_task); + virtual void thread_main(GraphTask *task); virtual void thread_on_exception(FunctionTask& task, std::exception& e); std::once_flag start_threads_flag; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 46a80b90b29ffa..b02bdf3928f2ff 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -328,7 +328,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// See Function::is_traceable() for definition. 
struct TraceableFunction : public Function { using Function::Function; - bool is_traceable() final { + bool is_traceable() final override { return true; } }; diff --git a/torch/csrc/autograd/function_hook.h b/torch/csrc/autograd/function_hook.h index f3cf5b2e793c6a..03c52fea54535c 100644 --- a/torch/csrc/autograd/function_hook.h +++ b/torch/csrc/autograd/function_hook.h @@ -10,12 +10,12 @@ struct Variable; using variable_list = std::vector; struct FunctionPreHook { - virtual ~FunctionPreHook() = default; + virtual ~FunctionPreHook() {} virtual variable_list operator()(const variable_list& grads) = 0; }; struct FunctionPostHook { - virtual ~FunctionPostHook() = default; + virtual ~FunctionPostHook() {} virtual variable_list operator()(const variable_list& grad_input, const variable_list& grad_output) = 0; }; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index db86ae428d4060..44d4b7f106c860 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -6,9 +6,9 @@ namespace torch { namespace autograd { struct AccumulateGrad : public Function { - explicit AccumulateGrad(Variable variable_); + explicit AccumulateGrad(Variable variable); - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; Variable variable; }; diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index c4a54d99d08702..b04b0f25ca42d5 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -11,7 +11,7 @@ namespace torch { namespace autograd { -auto Error::apply(variable_list&& inputs) -> variable_list { +auto Error::apply(variable_list&& grad_outputs) -> variable_list { throw std::runtime_error(msg); } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index 1a21a360ba9fc2..aa4b422136930f 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -13,7 +13,7 @@ namespace torch { namespace autograd { struct CopyBackwards : public Function { - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; at::Type *src_type; int32_t src_device = -1; @@ -23,12 +23,9 @@ struct CopyBackwards : public Function { // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. struct CopySlices : public Function { - CopySlices( - const Variable& base_var, - at::TensorGeometry view_, - std::shared_ptr fn_); + CopySlices(const Variable& base, at::TensorGeometry view, std::shared_ptr fn); - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; void release_variables() override; at::TensorGeometry base; diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index f1c02e0d78e565..2e0febfc84b0bc 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -22,14 +22,14 @@ struct InputBuffer { InputBuffer& operator=(InputBuffer&& other) = default; // Accumulates the variable at a specified index. - void add(size_t pos, Variable var); + void add(size_t idx, Variable var); int device() const; Variable operator[](size_t pos) { return buffer[pos]; } // Returns the inputs as a list of variables. Destroys given InputBuffer. 
- static std::vector variables(InputBuffer&& g); + static std::vector variables(InputBuffer&& buffer); private: std::vector buffer; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index ba0fee1510baa2..dd77dc193ba9bd 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -185,7 +185,7 @@ struct TORCH_API RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that // there no autograd functions are being executed when these function are used. -TORCH_API void enableProfiler(ProfilerState new_state); +TORCH_API void enableProfiler(ProfilerState state); TORCH_API thread_event_lists disableProfiler(); } // namespace profiler diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index e9d29bd0caa688..08e494530040eb 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -45,7 +45,7 @@ namespace torch { namespace autograd { VariableInfo::VariableInfo(const Variable& var) : type(&var.type()) - , size(var.sizes().vec()) + , size(var.sizes()) , requires_grad(var.requires_grad()) { if (var.type().is_cuda()) { device = var.get_device(); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 1aa21f84d45cf2..cd8329cad01434 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -154,6 +154,14 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis result = applySelect(result, dim, THPUtils_unpackLong(obj)); } else if (PySlice_Check(obj)) { result = applySlice(result, dim, obj); +#ifndef USE_TH_SIZE_ZERO_DIM + if (result.numel() == 0) { + // TODO: currently we don't have support for 0-sized dims, so slicing a dim + // to size 0 will return a size 0 tensor. For now, just short-circuit slicing + // and return that size 0 tensor.
+ return result; + } +#endif dim++; } else if (obj == Py_Ellipsis) { dim += self.dim() - specified_dims; diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 037f06a7f95c11..61a1d3b3eac172 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -45,10 +45,10 @@ class TORCH_API SavedVariable { std::weak_ptr grad_accumulator_; VariableVersion version_counter_; - uint32_t saved_version_ = 0; - uint32_t output_nr_ = 0; + uint32_t saved_version_; + uint32_t output_nr_; bool was_default_constructed_ = true; - bool requires_grad_ = false; - bool has_grad_fn_ = false; + bool requires_grad_; + bool has_grad_fn_; }; }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 30aded0a85e73a..9bbae25d9c4d96 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(data.type().backend(), data.type().scalarType(), nullptr, /* is variable */ true), + : TensorImpl(VariableType::getType(data), nullptr), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -118,9 +118,7 @@ void Variable::Impl::backward( void Variable::Impl::set_data(Tensor new_data) { if (new_data.type() != data_.type()) { - scalar_type_ = new_data.type().scalarType(); - backend_ = new_data.type().backend(); - is_variable_ = true; + type_ = VariableType::getType(new_data.type()); // Clear grad_accumulator if it exists, since it stores the old type info. grad_accumulator_.reset(); } @@ -156,8 +154,8 @@ std::shared_ptr& Variable::ViewImpl::get_grad_fn() { AT_ASSERT(output_nr_ == 0); auto fn = std::make_shared(); fn->self_geometry = at::TensorGeometry(base_); - fn->size = sizes().vec(); - fn->stride = strides().vec(); + fn->size = sizes(); + fn->stride = strides(); fn->storage_offset = data_.storage_offset(); fn->set_next_edges(collect_next_edges(base_)); fn->add_input_metadata(base_.type(), sizes()); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index d46008bbdd10b0..c97a0322359a4d 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -263,7 +263,7 @@ struct Variable::Impl : public at::TensorImpl { TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, - Edge gradient_edge = Edge()); + Edge edge = Edge()); ~Impl() override; @@ -327,6 +327,9 @@ struct Variable::Impl : public at::TensorImpl { /// Reset all expensive fields to free up resources void release_resources() override; + // Make this field public so we can access it from `Variable`. 
+ using at::TensorImpl::type_; + std::string name; at::Tensor data_; diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 8237239f99b639..0e869876e8e1fa 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -74,7 +74,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu } tensor_list2d outputs(devices.size()); - outputs[0] = tensors.vec(); + outputs[0] = tensors; for (auto & o : outputs) o.reserve(tensors.size()); diff --git a/torch/csrc/distributed/c10d/ddp.h b/torch/csrc/distributed/c10d/ddp.h deleted file mode 100644 index 7b26c1475fc1c6..00000000000000 --- a/torch/csrc/distributed/c10d/ddp.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include - -#include - -#include - -#include -#include -#include - -namespace c10d { -inline void distBroadcastCoalesced( - std::vector& tensors, - int64_t bufferSize, - ProcessGroup& processGroup) { - auto tensorGroups = torch::utils::take_tensors(tensors, bufferSize); - // We store single-element vectors in `flatTensors` because - // `ProcessGroup::broadcast` takes a reference to a vector, which must be - // alive until the `wait()` call on the returned `Work` completes. - std::vector> flatTensors; - std::vector> work; - flatTensors.reserve(tensorGroups.size()); - work.reserve(tensorGroups.size()); - for (const auto& group : tensorGroups) { - // Flatten each group of tensors (whose size equals `bufferSize`) into a - // single tensor. - flatTensors.push_back({torch::utils::flatten_dense_tensors(group.tensors)}); - BroadcastOptions broadcastOptions; - broadcastOptions.rootRank = 0; - broadcastOptions.rootTensor = 0; - // Enqueue a work item and collect the `Work` (essntially a "future") so we - // can `wait()` for its completion after we have collected all `Work` items. - work.push_back( - processGroup.broadcast(flatTensors.back(), broadcastOptions)); - } - // Now loop through each group, wait for the broadcast to complete, and - // un-flatten the broadcast tensor back into device-local individual tensors. - for (size_t group = 0; group < tensorGroups.size(); ++group) { - auto& tensors = tensorGroups[group].tensors; - work[group]->wait(); - const auto synced = - torch::utils::unflatten_dense_tensors(flatTensors[group][0], tensors); - AT_ASSERT(synced.size() == tensors.size()); - for (size_t i = 0; i < synced.size(); ++i) { - // Copy into the per-process tensors. 
- tensors[i].copy_(synced[i], /*non_blocking=*/true); - } - } -} -} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 797fcbcdd2432e..2bd7a871dc36fc 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -13,10 +13,9 @@ #include #include -#include -#include -#include -#include +#include "torch/csrc/Exceptions.h" +#include "torch/csrc/utils/object_ptr.h" +#include "torch/csrc/utils/pybind.h" namespace torch { namespace distributed { @@ -200,8 +199,6 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); - module.def("_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced); - Py_RETURN_TRUE; } diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index f404b4ce9a05c6..d6bd90cb708784 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -59,21 +59,20 @@ struct ArgumentSpec { for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; pod.is_tensor = static_cast(inputs[i].isTensor()); - if (pod.is_tensor) { - at::Tensor t = inputs[i].toTensor(); - pod.defined = t.defined(); - if (pod.defined) { - pod.type = static_cast(t.type().scalarType()); - pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); - total_dims += t.ndimension(); - auto sizes = t.sizes(); - std::copy(sizes.begin(),sizes.end(), next_dim); - next_dim += sizes.size(); - auto strides = t.strides(); - std::copy(strides.begin(), strides.end(), next_dim); - next_dim += strides.size(); - } + if (!pod.is_tensor) continue; + at::Tensor t = inputs[i].toTensor(); + pod.defined = t.defined(); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); + pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); + total_dims += t.ndimension(); + auto sizes = t.sizes(); + std::copy(sizes.begin(),sizes.end(), next_dim); + next_dim += sizes.size(); + auto strides = t.strides(); + std::copy(strides.begin(), strides.end(), next_dim); + next_dim += strides.size(); } // each POD has a running tally of all dimensions including its own pod.total_dims = total_dims; diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index 53b87af9ef991d..f69790cab52e00 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -28,7 +28,7 @@ struct AttributeValue { Symbol name; virtual AttributeKind kind() const = 0; virtual Ptr clone() const = 0; - virtual ~AttributeValue() = default; + virtual ~AttributeValue() {} }; template @@ -101,7 +101,7 @@ struct AttributeError : public std::exception { // we return Derived* pointers because Nodes are normally held as pointers. 
template struct Attributes { - Attributes() = default; + Attributes() {} void copyAttributes(const Attributes & rhs) { values_.clear(); for(auto & i : rhs.values_) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 7f250bf7c452aa..c830dc45a537f5 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -9,7 +9,6 @@ #include #include -#include namespace torch { namespace jit { @@ -565,13 +564,14 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& _graph, const std::vector& requires_grad) { Gradient grad_desc; // Take ownership of the graph - JIT_ASSERTM(graph.use_count() == 1, - "differentiate will mutate and destroy the graph, so it requires " - "graph.use_count() == 1, but found %d", graph.use_count()); - std::swap(graph, grad_desc.f); + JIT_ASSERTM( + _graph.use_count() == 1, + "differentiate will mutate and destroy the graph, so it requires " + "graph.use_count() == 1, but found ", _graph.use_count()); + std::swap(_graph, grad_desc.f); // XXX: Take care when handling outputs - they can be duplicated! WithInsertPoint guard(grad_desc.f->block()); diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index ea2b7a1170efeb..6dd2be9db0e779 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -4,9 +4,7 @@ #include "torch/csrc/jit/ir.h" #include - #include -#include namespace torch { namespace jit { diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 47e593bbb125e2..3c4ad0c130ea31 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -22,13 +22,8 @@ Value* insertConstant( n->f_(attr::value, val.toDouble()); n->output()->setType(FloatType::get()); } else if(val.isIntList()) { - n->is_(attr::value, val.toIntList()->elements().vec()); + n->is_(attr::value, val.toIntList()->elements()); n->output()->setType(ListType::ofInts()); - } else if(val.isTensorList()) { - n->ts_(attr::value, fmap(val.toTensorList()->elements(), [](const at::Tensor & t) { - return autograd::Variable(t).data(); - })); - n->output()->setType(ListType::ofTensors()); } else { throw std::runtime_error("Unsupported value kind: " + val.tagKind()); } @@ -71,14 +66,6 @@ RegisterOperators reg({ push(stack, is); return 0; }; - } else if(type->isSubtypeOf(ListType::ofTensors())) { - auto ts = fmap(node->ts(attr::value), [](const at::Tensor & t) -> at::Tensor { - return autograd::make_variable(t); - }); - return [ts](Stack& stack) { - push(stack, ts); - return 0; - }; } else { std::stringstream ss; ss << "constant literal not supported for: " << type->str(); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 20208af5496c28..71dec999c40216 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -1,7 +1,6 @@ #include "torch/csrc/jit/export.h" -#include "torch/csrc/autograd/symbolic.h" -#include "onnx/onnx.pb.h" #include "torch/csrc/onnx/onnx.h" +#include "torch/csrc/autograd/symbolic.h" #include "torch/csrc/utils/functional.h" #include @@ -19,8 +18,7 @@ namespace torch { namespace jit { namespace { -namespace onnx_torch = ::torch::onnx; -namespace onnx = ::ONNX_NAMESPACE; +namespace onnx = ::torch::onnx; std::string value_name(Value* n) { return n->uniqueName(); @@ -28,7 +26,7 @@ std::string value_name(Value* n) { struct ExportContext { size_t num_blocks = 0; - 
onnx_torch::OperatorExportTypes operator_export_type; + onnx::OperatorExportTypes operator_export_type; }; void encodeGraph(onnx::GraphProto * p_g, const std::shared_ptr & g, @@ -45,37 +43,34 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, for(auto d : tensor.sizes()) { p->add_dims(d); } - onnx::TensorProto_DataType onnx_type; + onnx::DataType onnx_type; // Most integral types and float16 need to be serialized as int32 at::ScalarType cast_type = tensor.type().scalarType(); switch(tensor.type().scalarType()) { case at::kDouble: - onnx_type = onnx::TensorProto_DataType_DOUBLE; + onnx_type = onnx::kDOUBLE; break; case at::kFloat: - onnx_type = onnx::TensorProto_DataType_FLOAT; + onnx_type = onnx::kFLOAT; break; case at::kHalf: - onnx_type = onnx::TensorProto_DataType_FLOAT16; + onnx_type = onnx::kFLOAT16; cast_type = at::kInt; break; case at::kByte: - onnx_type = onnx::TensorProto_DataType_UINT8; - cast_type = at::kInt; - break; case at::kChar: - onnx_type = onnx::TensorProto_DataType_INT8; + onnx_type = onnx::kINT8; cast_type = at::kInt; break; case at::kShort: - onnx_type = onnx::TensorProto_DataType_INT16; + onnx_type = onnx::kINT16; cast_type = at::kInt; break; case at::kInt: - onnx_type = onnx::TensorProto_DataType_INT32; + onnx_type = onnx::kINT32; break; case at::kLong: - onnx_type = onnx::TensorProto_DataType_INT64; + onnx_type = onnx::kINT64; break; default: AT_ERROR("unexpected tensor scalar type"); @@ -90,14 +85,13 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, if (external_ref) { // For now, we use the name of the tensor as the external lookup name to // avoid ONNX protobuf changes. - JIT_ASSERT(external_ref.value() == p->name()); + JIT_ASSERT(external_ref.value() == p->get_name()); JIT_ASSERT(raw_data_export_map != nullptr); JIT_ASSERT(raw_data_export_map->count(external_ref.value()) == 0); (*raw_data_export_map)[external_ref.value()] = t; - p->set_raw_data("__EXTERNAL"); + p->set_external_data_present(); } else { - JIT_ASSERT(t.is_contiguous()); - p->set_raw_data(std::string(static_cast(t.data_ptr()), t.type().elementSizeInBytes() * t.numel())); + p->set_raw_data(t); } } @@ -108,50 +102,50 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export switch(n->kindOf(name)) { case AttributeKind::f: attr->set_f(n->f(name)); - attr->set_type(onnx::AttributeProto_AttributeType_FLOAT); + attr->set_type(onnx::aFLOAT); break; case AttributeKind::fs: - attr->set_type(onnx::AttributeProto_AttributeType_FLOATS); + attr->set_type(onnx::aFLOATS); for(auto & v : n->fs(name)) attr->add_floats(v); break; case AttributeKind::i: - attr->set_type(onnx::AttributeProto_AttributeType_INT); + attr->set_type(onnx::aINT); attr->set_i(n->i(name)); break; case AttributeKind::is: - attr->set_type(onnx::AttributeProto_AttributeType_INTS); + attr->set_type(onnx::aINTS); for(auto & v : n->is(name)) attr->add_ints(v); break; case AttributeKind::s: - attr->set_type(onnx::AttributeProto_AttributeType_STRING); + attr->set_type(onnx::aSTRING); attr->set_s(n->s(name)); break; case AttributeKind::ss: - attr->set_type(onnx::AttributeProto_AttributeType_STRINGS); + attr->set_type(onnx::aSTRINGS); for(auto & v : n->ss(name)) attr->add_strings(v); break; case AttributeKind::t: { - attr->set_type(onnx::AttributeProto_AttributeType_TENSOR); + attr->set_type(onnx::aTENSOR); auto t = attr->mutable_t(); encodeTensor(t, n->t(name)); } break; case AttributeKind::ts: - attr->set_type(onnx::AttributeProto_AttributeType_TENSORS); + 
attr->set_type(onnx::aTENSORS); for(auto & v : n->ts(name)) { auto t = attr->add_tensors(); encodeTensor(t, v); } break; case AttributeKind::g: { - attr->set_type(onnx::AttributeProto_AttributeType_GRAPH); + attr->set_type(onnx::aGRAPH); auto g = attr->mutable_g(); encodeGraph(g, n->g(name), {}, ctx, nullptr); } break; case AttributeKind::gs: - attr->set_type(onnx::AttributeProto_AttributeType_GRAPHS); + attr->set_type(onnx::aGRAPHS); for(auto & v : n->gs(name)) { auto g = attr->add_graphs(); encodeGraph(g, v, {}, ctx, nullptr); @@ -160,52 +154,49 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export } } -void encodeTypeProtoTensorType(onnx::TypeProto_Tensor* tensor_type, Value* n) { +void encodeTypeProtoTensorType(onnx::TypeProtoTensor* tensor_type, Value* n) { onnx::TensorShapeProto* shape = tensor_type->mutable_shape(); if (TensorTypePtr node_type = n->type()->cast()) { const std::vector& sizes = node_type->sizes(); - for (size_t i = 0; i < sizes.size(); i++) { - shape->add_dim(); - shape->mutable_dim(i)->set_dim_value(sizes[i]); + for (std::int64_t s : sizes) { + shape->add_dim(s); } - onnx::TensorProto_DataType onnx_type; + onnx::DataType onnx_type; switch(node_type->scalarType()) { case at::kDouble: - onnx_type = onnx::TensorProto_DataType_DOUBLE; + onnx_type = onnx::kDOUBLE; break; case at::kFloat: - onnx_type = onnx::TensorProto_DataType_FLOAT; + onnx_type = onnx::kFLOAT; break; case at::kHalf: - onnx_type = onnx::TensorProto_DataType_FLOAT16; + onnx_type = onnx::kFLOAT16; break; case at::kByte: - onnx_type = onnx::TensorProto_DataType_UINT8; - break; case at::kChar: - onnx_type = onnx::TensorProto_DataType_INT8; + onnx_type = onnx::kINT8; break; case at::kShort: - onnx_type = onnx::TensorProto_DataType_INT16; + onnx_type = onnx::kINT16; break; case at::kInt: - onnx_type = onnx::TensorProto_DataType_INT32; + onnx_type = onnx::kINT32; break; case at::kLong: - onnx_type = onnx::TensorProto_DataType_INT64; + onnx_type = onnx::kINT64; break; default: AT_ERROR("unexpected tensor scalar type"); break; } - tensor_type->set_elem_type(onnx_type); + tensor_type->set_data_type(onnx_type); } } void encodeValueInfo(onnx::ValueInfoProto* v, Value* n) { v->set_name(value_name(n)); onnx::TypeProto* t = v->mutable_type(); - onnx::TypeProto_Tensor* tensor_type = t->mutable_tensor_type(); + onnx::TypeProtoTensor* tensor_type = t->mutable_tensor_type(); encodeTypeProtoTensorType(tensor_type, n); } @@ -235,7 +226,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, encodeValueInfo(v, output); } for (auto node : b->nodes()) { - bool is_raw_export = ctx->operator_export_type == onnx_torch::OperatorExportTypes::RAW; + bool is_raw_export = ctx->operator_export_type == onnx::OperatorExportTypes::RAW; if (node->kind() == prim::Undefined && !is_raw_export) { // Undefined nodes are used to implement optional inputs. 
One // way to "not provide" an optional input is to create an @@ -262,7 +253,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, JIT_ASSERT(!node->kind().is_onnx()); p_n->set_domain(node->kind().domainString()); } - else if (ctx->operator_export_type != onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { + else if (ctx->operator_export_type != onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK) { JIT_ASSERT(node->kind().is_onnx()); } p_n->set_op_type(node->kind().toUnqualString()); @@ -272,7 +263,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, if (is_raw_export && node->blocks().size() > 0) { auto blocks = p_n->add_attribute(); blocks->set_name("_blocks"); - blocks->set_type(onnx::AttributeProto_AttributeType_GRAPHS); + blocks->set_type(onnx::aGRAPHS); for (auto block : node->blocks()) { auto graph = blocks->add_graphs(); encodeBlock(graph, block, initializers, ctx, raw_data_export_map); @@ -283,7 +274,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto body = p_n->add_attribute(); body->set_name("body"); - body->set_type(onnx::AttributeProto_AttributeType_GRAPH); + body->set_type(onnx::aGRAPH); auto g = body->mutable_g(); encodeBlock(g, node->blocks()[0], {}, ctx, raw_data_export_map); } @@ -292,13 +283,13 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto true_branch = p_n->add_attribute(); true_branch->set_name("then_branch"); - true_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); + true_branch->set_type(onnx::aGRAPH); auto true_g = true_branch->mutable_g(); encodeBlock(true_g, node->blocks()[0], {}, ctx, raw_data_export_map); auto false_branch = p_n->add_attribute(); false_branch->set_name("else_branch"); - false_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); + false_branch->set_type(onnx::aGRAPH); auto false_g = false_branch->mutable_g(); encodeBlock(false_g, node->blocks()[1], {}, ctx, raw_data_export_map); } @@ -309,7 +300,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, for (auto & tensor : initializers) { // TODO: stop using positions to determine which initializers // match to which inputs - std::string name = p_g->input(inputs_count++).name(); + std::string name = p_g->get_input_name(inputs_count++); auto p = p_g->add_initializer(); p->set_name(name); if (raw_data_export_map) { @@ -323,8 +314,8 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, void encodeModel(onnx::ModelProto* p_m, const std::shared_ptr& g, const std::vector& initializers, RawDataExportMap* raw_data_export_map = nullptr, - onnx_torch::OperatorExportTypes operator_export_type - = onnx_torch::OperatorExportTypes::ONNX) { + onnx::OperatorExportTypes operator_export_type + = onnx::OperatorExportTypes::ONNX) { onnx::GraphProto* p_g = p_m->mutable_graph(); ExportContext ctx; ctx.operator_export_type = operator_export_type; @@ -343,7 +334,7 @@ std::string getNodeStackTraceString(Node* n) { } } // namespace -void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExportTypes operator_export_type) { +void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportTypes operator_export_type) { for (auto node : graph->nodes()) { // Macro'ed so we get a marginally better line number on failed export #define FAIL_EXPORT(name) \ @@ -365,7 +356,7 @@ void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExpo "Cannot export individual pack_padded_sequence or pad_packed_sequence; these operations must occur in pairs.\n\nUsage of this operation occurred at:\n" + getNodeStackTraceString(node)); } - bool is_aten_fallback = 
operator_export_type == onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK; + bool is_aten_fallback = operator_export_type == onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK; if (!node->kind().is_onnx() && !is_aten_fallback && node->kind() != prim::Undefined) { FAIL_EXPORT( "Couldn't export operator " + node->kind().toDisplayString() + "\n\nDefined at:\n" + @@ -376,182 +367,6 @@ void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExpo } } -// Pretty printing -namespace { -constexpr char indent_char = ' '; -constexpr size_t indent_multiplier = 2; - -std::string idt(size_t indent) { - return std::string(indent * indent_multiplier, indent_char); -} - -std::string nlidt(size_t indent) { - return std::string("\n") + idt(indent); -} - -void dump(const onnx::TensorProto& tensor, std::ostream& stream) { - stream << "TensorProto shape: ["; - for (int i = 0; i < tensor.dims_size(); ++i) { - stream << tensor.dims(i) << (i == tensor.dims_size() - 1 ? "" : " "); - } - stream << "]"; -} - -void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { - for (int i = 0; i < shape.dim_size(); ++i) { - auto &dim = shape.dim(i); - if (dim.has_dim_value()) { - stream << dim.dim_value(); - } else { - stream << "?"; - } - stream << (i == shape.dim_size() - 1 ? "" : " "); - } -} - -void dump(const onnx::TypeProto_Tensor& tensor_type, std::ostream& stream) { - stream << "Tensor dims: "; - dump(tensor_type.shape(), stream); -} - -void dump(const onnx::TypeProto& type, std::ostream& stream) { - dump(type.tensor_type(), stream); -} - -void dump(const onnx::ValueInfoProto& value_info, std::ostream& stream) { - stream << "{name: \"" << value_info.name() - << "\", type:"; - dump(value_info.type(), stream); - stream << "}"; -} - -void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent); - -void dump(const onnx::AttributeProto& attr, std::ostream& stream, size_t indent) { - stream << "{ name: '" << attr.name() << "', type: "; - if (attr.has_f()) { - stream << "float, value: " << attr.f(); - } else if (attr.has_i()) { - stream << "int, value: " << attr.i(); - } else if (attr.has_s()) { - stream << "string, value: '" << attr.s() << "'"; - } else if (attr.has_g()) { - stream << "graph, value:\n"; - dump(attr.g(), stream, indent+1); - stream << nlidt(indent); - } else if (attr.has_t()) { - stream << "tensor, value:"; - dump(attr.t(), stream); - } else if (attr.floats_size()) { - stream << "floats, values: ["; - for (int i = 0; i < attr.floats_size(); ++i) - stream << attr.floats(i) << (i == attr.floats_size() - 1 ? "" : " "); - stream << "]"; - } else if (attr.ints_size()) { - stream << "ints, values: ["; - for (int i = 0; i < attr.ints_size(); ++i) - stream << attr.ints(i) << (i == attr.ints_size() - 1 ? "" : " "); - stream << "]"; - } else if (attr.strings_size()) { - stream << "strings, values: ["; - for (int i = 0; i < attr.strings_size(); ++i) - stream << "'" << attr.strings(i) << "'" << (i == attr.strings_size() - 1 ? 
"" : " "); - stream << "]"; - } else if (attr.tensors_size()) { - stream << "tensors, values: ["; - for (auto& t : attr.tensors()) { - dump(t, stream); - } - stream << "]"; - } else if (attr.graphs_size()) { - stream << "graphs, values: ["; - for (auto& g : attr.graphs()) { - dump(g, stream, indent+1); - } - stream << "]"; - } else { - stream << "UNKNOWN"; - } - stream << "}"; -} - -void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { - stream << "Node {type: \"" << node.op_type() << "\", inputs: ["; - for (int i = 0; i < node.input_size(); ++i) { - stream << node.input(i) << (i == node.input_size() - 1 ? "" : ","); - } - stream << "], outputs: ["; - for (int i = 0; i < node.output_size(); ++i) { - stream << node.output(i) << (i == node.output_size() - 1 ? "" : ","); - } - stream << "], attributes: ["; - for (int i = 0; i < node.attribute_size(); ++i) { - dump(node.attribute(i), stream, indent+1); - stream << (i == node.attribute_size() - 1 ? "" : ","); - } - stream << "]}"; -} - -void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent) { - stream << idt(indent) << "GraphProto {" << nlidt(indent+1) - << "name: \"" << graph.name() << "\"" << nlidt(indent+1) - << "inputs: ["; - for (int i = 0; i < graph.input_size(); ++i) { - dump(graph.input(i), stream); - stream << (i == graph.input_size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "outputs: ["; - for (int i = 0; i < graph.output_size(); ++i) { - dump(graph.output(i), stream); - stream << (i == graph.output_size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "initializers: ["; - for (int i = 0; i < graph.initializer_size(); ++i) { - dump(graph.initializer(i), stream); - stream << (i == graph.initializer_size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "nodes: [" << nlidt(indent+2); - for (int i = 0; i < graph.node_size(); ++i) { - dump(graph.node(i), stream, indent+2); - if (i != graph.node_size() - 1) stream << "," << nlidt(indent+2); - } - stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; -} - -void dump(const onnx::OperatorSetIdProto& operator_set_id, std::ostream& stream) { - stream << "OperatorSetIdProto { domain: " << operator_set_id.domain() << "}"; -} - -void dump(const onnx::ModelProto& model, std::ostream& stream, size_t indent) { - stream << idt(indent) - << "ModelProto {" << nlidt(indent+1) - << "producer_name: \"" << model.producer_name() << "\"" << nlidt(indent+1) - << "domain: \"" << model.domain() << "\"" << nlidt(indent+1) - << "doc_string: \"" << model.doc_string() << "\""; - if (model.has_graph()) { - stream << nlidt(indent+1) << "graph:\n"; - dump(model.graph(), stream, indent+2); - } - if (model.opset_import_size()) { - stream << idt(indent+1) << "opset_import: ["; - for (auto &opset_imp : model.opset_import()) { - dump(opset_imp, stream); - } - stream << "],\n"; - } - stream << idt(indent) << "}\n"; -} -} // namespace - -std::string prettyPrint(const onnx::ModelProto& model) { - std::stringstream ss; - dump(model, ss, 0); - return ss.str(); -} - } namespace { @@ -561,15 +376,14 @@ RawDataExportMap ToModelProto( const std::vector & initializers, int64_t onnx_opset_version, bool defer_weight_export, - onnx_torch::OperatorExportTypes operator_export_type, + onnx::OperatorExportTypes operator_export_type, onnx::ModelProto *model_proto) { - if (operator_export_type != onnx_torch::OperatorExportTypes::RAW) { + if (operator_export_type != onnx::OperatorExportTypes::RAW) { validateGraph(graph, operator_export_type); } 
model_proto->set_producer_name("pytorch"); model_proto->set_producer_version("0.3"); - model_proto->set_ir_version(onnx::IR_VERSION); auto* imp = model_proto->add_opset_import(); // This is the version of ONNX operator set we are targeting imp->set_version(onnx_opset_version); @@ -597,12 +411,12 @@ std::string PrettyPrintExportedGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::ONNX_NAMESPACE::ModelProto model_proto; + ::torch::onnx::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - return prettyPrint(model_proto); + return model_proto.prettyPrint(); } // export_raw_ir will export IR ops without turning them into ONNX ops. @@ -616,12 +430,21 @@ std::tuple ExportGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::ONNX_NAMESPACE::ModelProto model_proto; + ::torch::onnx::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - return std::make_tuple(model_proto.SerializeAsString(), raw_data_export_map); + + size_t out_size; + pb_get_encoded_size(&out_size, onnx_ModelProto_fields, &model_proto.proto); + + // Allocate storage and export the graph + std::string out(out_size, '\0'); + pb_ostream_t ostream = pb_ostream_from_buffer(reinterpret_cast(&out[0]), out_size); + pb_encode(&ostream, onnx_ModelProto_fields, &model_proto.proto); + + return std::make_tuple(out, raw_data_export_map); } }} diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 22f8b40ba30542..8d20045efefe6a 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -345,14 +345,18 @@ std::vector emitCompilationUnit(std::ostream & out, size_t i = 0; for(auto o : subgraph.outputs()) { auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != prim::FusedConcat) { + if(o->node()->kind() != aten::cat) { emitFormal(o, desc); concat_desc.emplace_back(); flat_output_nodes.push_back(o); } else { auto cat = o->node(); - concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); - for(auto c : cat->inputs()) { + auto tensor_inputs = cat->inputs(); + // We need to drop the dim arg + tensor_inputs = tensor_inputs.slice(0, tensor_inputs.size() - 1); + size_t nInputs = tensor_inputs.size(); + concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); + for(auto c : tensor_inputs) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); } @@ -382,9 +386,8 @@ std::vector emitCompilationUnit(std::ostream & out, } for(auto n : subgraph.nodes()) { - // FusedConcat nodes work by narrowing the output Tensors before the kernel runs - if (n->kind() == prim::FusedConcat) - continue; + if(n->kind() == aten::cat) + continue; // Concat nodes by narrowing the output Tensors before the kernel runs env.s("node",valueName(n->output())); env.s("rhs", encodeRHS(n)); body << format("auto ${node} = ${rhs};\n",env); diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h index c2f35ee0aa2074..6c4759aefb692a 100644 --- a/torch/csrc/jit/fusion_compiler.h +++ b/torch/csrc/jit/fusion_compiler.h @@ -86,7 +86,7 @@ struct CompiledFusionFunction { 
TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); - virtual ~CompiledFusionFunction() = default; + virtual ~CompiledFusionFunction() {} // expects outputs to be pre-allocated void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 56a836b312d0c7..df81c378ad137d 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -21,7 +21,6 @@ #include "torch/csrc/jit/passes/specialize_undef.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_grad_of.h" -#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" @@ -241,7 +240,14 @@ struct GraphExecutorImpl { , symbolically_differentiable(symbolically_differentiable) , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {} GraphExecutorImpl(std::shared_ptr graph, bool optimize) - : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) {} + : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) { + for(auto input : graph->inputs()) { + JIT_ASSERTM(input->type()->kind() != TypeKind::TupleType, "tuples cannot be inputs to the graph"); + } + for(auto output : graph->outputs()) { + JIT_ASSERTM(output->type()->kind() != TypeKind::TupleType, "tuples cannot be outputs to the graph"); + } + } // entry point where execution begins void run(Stack & stack) { @@ -510,28 +516,28 @@ void runRequiredPasses(const std::shared_ptr& g) { RemoveExpands(g); } -void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec) { +void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& spec) { // clean up GradOf and AutogradAdd nodes // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { defined.push_back(spec.at(i).defined()); } - specializeUndef(*graph, defined); + specializeUndef(*graph_, defined); // required passes shared with autograd fallback - runRequiredPasses(graph); + runRequiredPasses(graph_); // Decompose addmm nodes to add + mm, so expands can be inserted and // gradients accumulated on the backward pass // // In the future, if we need more passes like this, we should convert this // into a generic canonicalization pass. - DecomposeAddmm(graph); + DecomposeAddmm(graph_); // clean up dead constants from specialization - EliminateDeadCode(graph); + EliminateDeadCode(graph_); // calculate all input shapes - PropagateInputShapes(*graph, spec); + PropagateInputShapes(*graph_, spec); } void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables) { @@ -548,7 +554,7 @@ void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariab // They also may assume that concrete sizes/strides are availiable UnrollLoops(graph); - ConstantPropagation(graph); + //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. 
PeepholeOptimize(graph); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 2693af50af1025..4e862c9e0a1e44 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -34,7 +34,7 @@ struct GraphExecutorState { struct GraphExecutorImpl; struct TORCH_API GraphExecutor { - GraphExecutor() = default; + GraphExecutor() {} GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); diff --git a/torch/csrc/jit/graph_node_list.h b/torch/csrc/jit/graph_node_list.h index 054b9517776863..996a8b2c75fa0f 100644 --- a/torch/csrc/jit/graph_node_list.h +++ b/torch/csrc/jit/graph_node_list.h @@ -1,5 +1,3 @@ -#pragma once - #include "torch/csrc/jit/assertions.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index a453925cf2f8eb..5b128fd822dafd 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -1,5 +1,5 @@ #include "torch/csrc/jit/import.h" -#include "onnx/onnx.pb.h" +#include "torch/csrc/onnx/onnx.npb.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/jit/assertions.h" @@ -16,60 +16,401 @@ namespace torch { namespace jit { namespace { -// IR graph construction +// Deserialized data + +struct Tensor_ { + std::vector dims; + std::vector raw_data; + onnx_TensorProto_DataType data_type; +}; + +struct AttributeValue_ { + std::string name; + onnx_AttributeProto_AttributeType type; + double f; + int64_t i; + std::string s; + Tensor_ t; + std::string g; + std::vector fs; + std::vector is; + std::vector ss; + std::vector ts; + std::vector gs; +}; + +struct Value_ { + std::string name; +}; + +struct Node_ { + std::string op_type; + std::string domain; + std::vector inputs; + std::vector outputs; + std::vector attrs; +}; + +struct Graph_ { + std::vector inputs; + std::vector outputs; + std::vector nodes; + std::vector initializers; +}; + +struct Model_ { + Graph_ graph; +}; + + +// Readers + +struct ReaderBase { + ReaderBase() {} + ReaderBase(pb_callback_t& cb) { + initialize_callback(cb); + } + + void initialize_callback(pb_callback_t& cb) { + cb.funcs.decode = ReaderBase::decode; + cb.arg = this; + } + + virtual void decode(pb_istream_t *stream) = 0; + + static bool decode(pb_istream_t *stream, const pb_field_t *, void **_self) { + ReaderBase* self = *reinterpret_cast(_self); + self->decode(stream); + return true; + } +}; + + +template +struct Reader : ReaderBase {}; + +template +struct Reader> : Reader { + Reader(pb_callback_t& cb) : Reader(cb) {} + // Decode is going to be called repeatedly from the callback + // (registered in the parent class constructor) each time an + // element is encountered. So all we do is relay the decoding + // through the parent class decode and push the result, every + // time this decode is called. + virtual void decode(pb_istream_t *stream) override { + Reader::decode(stream); + values.push_back(std::move(Reader::value)); + } + std::vector values; +}; + +template<> +struct Reader : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + // For string and bytes, the length value has already been + // parsed, and is available at stream->bytes_left. 
+ std::vector res(stream->bytes_left); + if (!pb_read(stream, res.data(), stream->bytes_left)) { + throw std::runtime_error("Decoding failed"); + } + value.assign(res.begin(), res.end()); + } + std::string value; +}; + +template<> +struct Reader : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode_fixed32(stream, &value)) { + throw std::runtime_error("Decoding failed"); + } + } + double value; +}; + +template<> +struct Reader : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode_varint(stream, reinterpret_cast(&value))) { + throw std::runtime_error("Decoding failed"); + } + } + int64_t value; +}; + +template<> +struct Reader> : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + // For string and bytes, the length value has already been + // parsed, and is available at stream->bytes_left. + value.resize(stream->bytes_left); + if (!pb_read(stream, value.data(), stream->bytes_left)) { + throw std::runtime_error("Decoding failed"); + } + } + std::vector value; +}; + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_TensorProto_init_default) + , dims_reader(proto.dims) + , raw_data_reader(proto.raw_data) + {} + + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_TensorProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } -namespace onnx = ::ONNX_NAMESPACE; + value.dims = std::move(dims_reader.values); + value.raw_data = std::move(raw_data_reader.value); + value.data_type = proto.data_type; + } -at::Tensor buildTensor(const onnx::TensorProto& tensor_proto) { + onnx_TensorProto proto; + Reader> dims_reader; + Reader> raw_data_reader; + Tensor_ value; +}; + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_AttributeProto_init_default) + , name_reader(proto.name) + , str_reader(proto.s) + , tensor_reader(proto.t) + , graph_reader(proto.g) + , floats_reader(proto.floats) + , ints_reader(proto.ints) + , strings_reader(proto.strings) + , tensors_reader(proto.tensors) + , graphs_reader(proto.graphs) {} + + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_AttributeProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.name = std::move(name_reader.value); + value.type = proto.type; + value.f = proto.f; + value.i = proto.i; + value.s = std::move(str_reader.value); + value.t = std::move(tensor_reader.value); + value.g = std::move(graph_reader.value); + value.fs = std::move(floats_reader.values); + value.is = std::move(ints_reader.values); + value.ss = std::move(strings_reader.values); + value.ts = std::move(tensors_reader.values); + value.gs = std::move(graphs_reader.values); + } + + onnx_AttributeProto proto; + Reader name_reader; + Reader str_reader; + Reader tensor_reader; + Reader graph_reader; + Reader> floats_reader; + Reader> ints_reader; + Reader> strings_reader; + Reader> tensors_reader; + Reader> graphs_reader; + AttributeValue_ value; +}; + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_ValueInfoProto_init_default) + , name_reader(proto.name) {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t 
*stream) override { + if (!pb_decode(stream, onnx_ValueInfoProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.name = std::move(name_reader.value); + } + + onnx_ValueInfoProto proto; + Reader name_reader; + Value_ value; +}; + + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_NodeProto_init_default) + , op_type_reader(proto.op_type) + , domain_reader(proto.domain) + , inputs_reader(proto.input) + , outputs_reader(proto.output) + , attrs_reader(proto.attribute) + {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_NodeProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.op_type = std::move(op_type_reader.value); + value.domain = std::move(domain_reader.value); + value.inputs = std::move(inputs_reader.values); + value.outputs = std::move(outputs_reader.values); + value.attrs = std::move(attrs_reader.values); + } + + onnx_NodeProto proto; + Reader op_type_reader; + Reader domain_reader; + Reader> inputs_reader; + Reader> outputs_reader; + Reader> attrs_reader; + Node_ value; +}; + + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_GraphProto_init_default) + , input_reader(proto.input) + , output_reader(proto.output) + , node_reader(proto.node) + , initializer_reader(proto.initializer) + {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_GraphProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.inputs = std::move(input_reader.values); + value.outputs = std::move(output_reader.values); + value.nodes = std::move(node_reader.values); + value.initializers = std::move(initializer_reader.values); + } + + static Graph_ read(pb_istream_t *stream) { + Reader reader; + reader.decode(stream); + return reader.value; + } + + onnx_GraphProto proto; + Reader> input_reader; + Reader> output_reader; + Reader> node_reader; + Reader> initializer_reader; + Graph_ value; +}; + + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_ModelProto_init_default) + , graph_reader(proto.graph) {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_ModelProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.graph = std::move(graph_reader.value); + } + + static Model_ read(pb_istream_t *stream) { + Reader reader; + reader.decode(stream); + return reader.value; + } + + onnx_ModelProto proto; + Reader graph_reader; + Model_ value; +}; + + +// IR graph construction + +at::Tensor buildTensor(const Tensor_& tensor_) { at::Tensor tensor; - switch(tensor_proto.data_type()) { - case onnx::TensorProto_DataType_UINT8: + switch(tensor_.data_type) { + case onnx_TensorProto_DataType_UINT8: tensor = at::CPU(at::kByte).tensor(); break; - case onnx::TensorProto_DataType_INT8: + case onnx_TensorProto_DataType_INT8: tensor = at::CPU(at::kChar).tensor(); break; - case onnx::TensorProto_DataType_INT16: + case onnx_TensorProto_DataType_INT16: tensor = at::CPU(at::kShort).tensor(); break; - case onnx::TensorProto_DataType_INT32: + case onnx_TensorProto_DataType_INT32: tensor = at::CPU(at::kInt).tensor(); break; - case onnx::TensorProto_DataType_INT64: + case onnx_TensorProto_DataType_INT64: tensor = at::CPU(at::kLong).tensor(); break; - case 
onnx::TensorProto_DataType_FLOAT16: + case onnx_TensorProto_DataType_FLOAT16: tensor = at::CPU(at::kHalf).tensor(); break; - case onnx::TensorProto_DataType_FLOAT: + case onnx_TensorProto_DataType_FLOAT: tensor = at::CPU(at::kFloat).tensor(); break; - case onnx::TensorProto_DataType_DOUBLE: + case onnx_TensorProto_DataType_DOUBLE: tensor = at::CPU(at::kDouble).tensor(); break; default: throw std::runtime_error("Unsupported data type"); } - std::vector sizes = {tensor_proto.dims().begin(), tensor_proto.dims().end()}; - tensor.resize_(sizes); + tensor.resize_(tensor_.dims); JIT_ASSERT( tensor.storage()->pImpl()->get_size() * tensor.storage()->pImpl()->elementSize() == - tensor_proto.raw_data().size()); + tensor_.raw_data.size()); - std::memcpy(tensor.data_ptr(), tensor_proto.raw_data().data(), tensor_proto.raw_data().size()); + std::memcpy(tensor.data_ptr(), tensor_.raw_data.data(), tensor_.raw_data.size()); return tensor; } -void buildBlock(const onnx::GraphProto& graph_proto, Block* block, +Graph_ readSubgraph(const std::string& serialized_subgraph) { + pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_subgraph.data()), serialized_subgraph.size()); + + return Reader::read(&istream); +} + +void buildBlock(const Graph_& graph_, Block* block, std::unordered_map& value_map); -void buildBlocks(const std::vector& graphs_, Node* node, +void buildBlocks(const std::vector& graphs_, Node* node, std::unordered_map& value_map) { for (auto g_ : graphs_) { auto block = node->addBlock(); @@ -77,96 +418,97 @@ void buildBlocks(const std::vector& graphs_, Node* node, } } -std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) { +std::shared_ptr buildGraph(const Graph_& graph_) { auto graph = std::make_shared(); std::unordered_map value_map; - buildBlock(graph_proto, graph->block(), value_map); + buildBlock(graph_, graph->block(), value_map); return graph; } -void buildBlock(const onnx::GraphProto& graph_proto, Block* block, +void buildBlock(const Graph_& graph_, Block* block, std::unordered_map& value_map) { - for (auto & input : graph_proto.input()) { - value_map[input.name()] = block->addInput(); + for (auto & input : graph_.inputs) { + value_map[input.name] = block->addInput(); } - for (auto & node_ : graph_proto.node()) { - JIT_ASSERT(node_.op_type() != "PythonOp"); + for (auto & node_ : graph_.nodes) { + JIT_ASSERT(node_.op_type != "PythonOp"); - auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain(), node_.op_type()), - node_.output().size()); + auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain, node_.op_type), + node_.outputs.size()); - for (auto & attr : node_.attribute()) { - Symbol name = Symbol::attr(attr.name()); + for (auto & attr : node_.attrs) { + Symbol name = Symbol::attr(attr.name); - switch(attr.type()) { - case onnx::AttributeProto_AttributeType_UNDEFINED: + switch(attr.type) { + case onnx_AttributeProto_AttributeType_UNDEFINED: throw std::runtime_error("UNDEFINED attribute unsupported"); break; - case onnx::AttributeProto_AttributeType_FLOAT: - node->f_(name, attr.f()); + case onnx_AttributeProto_AttributeType_FLOAT: + node->f_(name, attr.f); break; - case onnx::AttributeProto_AttributeType_INT: - node->i_(name, attr.i()); + case onnx_AttributeProto_AttributeType_INT: + node->i_(name, attr.i); break; - case onnx::AttributeProto_AttributeType_STRING: - node->s_(name, std::move(attr.s())); + case onnx_AttributeProto_AttributeType_STRING: + node->s_(name, std::move(attr.s)); break; - 
case onnx::AttributeProto_AttributeType_TENSOR: - node->t_(name, buildTensor(attr.t())); + case onnx_AttributeProto_AttributeType_TENSOR: + node->t_(name, buildTensor(attr.t)); break; - case onnx::AttributeProto_AttributeType_GRAPH: - node->g_(name, buildGraph(attr.g())); + case onnx_AttributeProto_AttributeType_GRAPH: + node->g_(name, buildGraph(readSubgraph(attr.g))); break; - case onnx::AttributeProto_AttributeType_FLOATS: - node->fs_(name, {attr.floats().begin(), attr.floats().end()}); + case onnx_AttributeProto_AttributeType_FLOATS: + node->fs_(name, std::move(attr.fs)); break; - case onnx::AttributeProto_AttributeType_INTS: - node->is_(name, {attr.ints().begin(), attr.ints().end()}); + case onnx_AttributeProto_AttributeType_INTS: + node->is_(name, std::move(attr.is)); break; - case onnx::AttributeProto_AttributeType_STRINGS: - node->ss_(name, {attr.strings().begin(), attr.strings().end()}); + case onnx_AttributeProto_AttributeType_STRINGS: + node->ss_(name, std::move(attr.ss)); break; - case onnx::AttributeProto_AttributeType_TENSORS: - node->ts_(name, fmap(attr.tensors(), [](const onnx::TensorProto& t) { return buildTensor(t); })); + case onnx_AttributeProto_AttributeType_TENSORS: + node->ts_(name, fmap(attr.ts, [](const Tensor_& t) { return buildTensor(t); })); break; - case onnx::AttributeProto_AttributeType_GRAPHS: - if (attr.name() == "_blocks") { - buildBlocks({attr.graphs().begin(), attr.graphs().end()}, node, value_map); + case onnx_AttributeProto_AttributeType_GRAPHS: + if (attr.name == "_blocks") { + buildBlocks(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); }), node, value_map); } else { - node->gs_(name, fmap(attr.graphs(), [](const onnx::GraphProto& g_) { return buildGraph(g_); })); + node->gs_(name, fmap(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); } ), + [](const Graph_& g_) { return buildGraph(g_); })); } break; } } - for (auto & input : node_.input()) { + for (auto & input : node_.inputs) { auto v = value_map[input]; node->addInput(v); } - for (int i=0; ioutputs()[i]; + for (size_t i=0; ioutputs()[i]; } block->appendNode(node); } - for (auto & output : graph_proto.output()) { - Value* v = value_map.at(output.name()); + for (auto & output : graph_.outputs) { + Value* v = value_map.at(output.name); block->registerOutput(v); } } -std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto, std::vector& initializers) { +std::shared_ptr buildGraph(const Graph_& graph_, std::vector& initializers) { - auto graph = buildGraph(graph_proto); + auto graph = buildGraph(graph_); - for (auto tensor_ : graph_proto.initializer()) { + for (auto tensor_ : graph_.initializers) { initializers.push_back(buildTensor(tensor_)); } @@ -215,10 +557,12 @@ void reconstructOutputTypes(Block *b) { std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector& initializers) { - auto model_proto = ::ONNX_NAMESPACE::ModelProto(); - model_proto.ParseFromString(serialized_graph); - auto graph = buildGraph(model_proto.graph(), initializers); + pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_graph.data()), serialized_graph.size()); + + auto model = Reader::read(&istream); + + auto graph = buildGraph(model.graph, initializers); reconstructOutputTypes(graph->block()); diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 5363eda02ff528..d3a9bd9139a96e 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -18,7 +18,6 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include 
"torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" -#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/to_batch.h" #include "torch/csrc/jit/passes/specialize_undef.h" @@ -71,14 +70,11 @@ void initJITBindings(PyObject *module) { }) .def("_jit_pass_lint", LintGraph) .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) { - PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs, graph.inputs()))); + PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs))); }) .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) - .def("_jit_pass_constant_propagation", [](std::shared_ptr& g) { - return ConstantPropagation(g); - }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will @@ -186,16 +182,15 @@ void initJITBindings(PyObject *module) { return ge.graph(); }) .def("graph_for", [](GraphExecutor& ge, py::args args) { - return ge.graphFor(createStack(args, ge.graph()->inputs())); + return ge.graphFor(createStack(args)); }) .def("get_debug_state", [](GraphExecutor& ge) { return ge.getDebugState(); }) .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { - const auto & graph = ge.graph(); - auto stack = createStack(args, graph->inputs()); + auto stack = createStack(args); ge.run(stack); - return wrapStack(std::move(stack), graph->outputs()); + return wrapStack(std::move(stack)); }); diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index c567793552d73a..52b8cb0eaccd98 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -50,7 +50,6 @@ _(prim, TensorToNum) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ -_(prim, FusedConcat) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 0c1fe17ade0dfd..65bdcf695f6de2 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -337,9 +337,9 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(at::Backend::Undefined,at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} + : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined)), nullptr) {} - virtual ~ContainerTensor() = default; + virtual ~ContainerTensor() {} virtual at::IntList sizes() const override { throw std::runtime_error("sizes() on ContainerTensor"); } @@ -685,8 +685,8 @@ struct CodeImpl { // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { - InterpreterStateImpl(const Code & code) - : function(code.pImpl), + InterpreterStateImpl(const Code & function_) + : function(function_.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { @@ -775,15 +775,15 @@ std::ostream & operator<<(std::ostream & out, const Code & code) { Code::Code(std::shared_ptr& graph) : pImpl(new CodeImpl(graph)) {} -Code::~Code() = default; +Code::~Code() {} const std::vector& Code::executors() { return pImpl->executors(); } -InterpreterState::InterpreterState(const Code & code) - 
: pImpl(new InterpreterStateImpl(code)) {} -InterpreterState::~InterpreterState() = default; +InterpreterState::InterpreterState(const Code & function) + : pImpl(new InterpreterStateImpl(function)) {} +InterpreterState::~InterpreterState() {} void InterpreterState::runOneStage(Stack & stack) { return pImpl->runOneStage(stack); diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index ede14249c46dce..7f09b22b324d11 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -44,9 +44,9 @@ std::ostream& operator<<(std::ostream & out, const at::ArrayRef & nodes) { } struct const_value_list_with_types { - const ArrayRef values; + const std::vector& values; bool use_newlines; - const_value_list_with_types(ArrayRef values, bool use_newlines = false) + const_value_list_with_types(const std::vector& values, bool use_newlines = false) : values(values), use_newlines(use_newlines) {} }; std::ostream& operator<<(std::ostream & out, const_value_list_with_types l) { @@ -355,7 +355,7 @@ void Graph::lint() const { // - every use will occur later in the topsort struct LintScope { - LintScope() = default; + LintScope() {} LintScope(std::unique_ptr parent) : parent(std::move(parent)) {} bool contains(const Value * v) { @@ -487,13 +487,13 @@ void LintGraph(std::shared_ptr& graph) { graph->lint(); } -void Block::cloneFrom(Block * src, std::function value_map) { +void Block::cloneFrom(Block * src, std::function outer_map) { std::unordered_map local_map; auto env = [&](Value * v) { auto it = local_map.find(v); if(it != local_map.end()) return it->second; - return value_map(v); + return outer_map(v); }; auto graph = owningGraph(); @@ -619,8 +619,23 @@ Value* Node::namedInput(Symbol name) const { // so this is completely unsafe and needs to be gone as soon as possible. 
return v; } + const auto & the_schema = schema(); + int64_t tensor_list_pos = 0; + for (auto & arg : the_schema.arguments) { + if (*arg.type == *ListType::ofTensors()) + break; + tensor_list_pos++; + } int64_t arg_pos = findArgument(schema(), name).first; - return input(arg_pos); + // XXX: we don't have a single value we could give for a Tensor[], + // because we flatten lists into arguments + JIT_ASSERT(arg_pos != tensor_list_pos); + // NB: if there's no tensor list, then tensor_list_pos == arguments.size(), so this is always true + if (arg_pos < tensor_list_pos) { + return input(arg_pos); + } else { + return input(inputs().size() - (the_schema.arguments.size() - arg_pos)); + } } bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { @@ -631,12 +646,8 @@ bool Node::matches(const char *signature_literal, at::ArrayRef const_inp return true; } -void Node::dump() const { - std::cout << *this << "\n"; -} - void Node::findSchema() const { - schema_ = &getOperatorFor(this).schema(); + schema_ = &getOperatorFor(this).schema; } PythonOp* defaultAllocPythonOp(Graph*g) { diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index b2caa642b6fe20..9af468e6ee06e7 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -54,7 +54,7 @@ struct Value; TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); -TORCH_API std::ostream& operator<<(std::ostream & out, const Node & n); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); // A list of nodes, with inputs and outputs struct Block; @@ -683,9 +683,7 @@ struct Node : public Attributes { return *schema_; } - void dump() const; - - virtual ~Node() = default; + virtual ~Node() {} private: std::pair findInput(Symbol name); void findSchema() const; @@ -891,7 +889,8 @@ friend struct Block; , block_(new Block(this, nullptr)) , insert_before_(return_node()) {} - Graph() : Graph(std::make_shared()) {} + Graph() + : Graph( std::make_shared()) {} at::ArrayRef inputs() { return block_->inputs(); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 6eef40a0323068..42a5be89e55e4b 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -83,7 +83,6 @@ struct ConstantList; struct IValue; using Tuple = ConstantList; using IntList = ConstantList; -using TensorList = ConstantList; using DoubleList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold @@ -94,7 +93,7 @@ using DoubleList = ConstantList; // retain/release calls. 
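// Illustrative sketch only (not part of this patch): the X-macro pattern used by
// TORCH_FORALL_TAGS below lets a single list of tags generate the enum members
// and the per-tag helpers in lockstep. DemoTag / FORALL_DEMO_TAGS / demoTagName
// are made-up names for illustration.
#define FORALL_DEMO_TAGS(_) _(None) _(Int) _(Double)
enum class DemoTag {
#define DEFINE_DEMO_TAG(x) x,
  FORALL_DEMO_TAGS(DEFINE_DEMO_TAG)
#undef DEFINE_DEMO_TAG
};
inline const char* demoTagName(DemoTag t) {
  switch (t) {
#define DEFINE_DEMO_CASE(x) case DemoTag::x: return #x;
    FORALL_DEMO_TAGS(DEFINE_DEMO_CASE)
#undef DEFINE_DEMO_CASE
  }
  return "unknown";
}
#undef FORALL_DEMO_TAGS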
#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(TensorList) + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) struct IValue { IValue() @@ -224,20 +223,6 @@ struct IValue { return toRetainable(); } - //TensorList - IValue(Shared v); - IValue(std::vector v); - bool isTensorList() const { return Tag::TensorList == tag; } - Shared toTensorList() && { - JIT_ASSERT(isTensorList()); - return moveToRetainable(); - } - Shared toTensorList() const & { - JIT_ASSERT(isTensorList()); - return toRetainable(); - } - - // None bool isNone() { return Tag::None == tag; } @@ -384,15 +369,8 @@ inline IValue::IValue(Shared v) inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} -inline IValue::IValue(Shared v) -: tag(Tag::TensorList), retainable(true) { - as_retainable = v.detach(); -} -inline IValue::IValue(std::vector v) -: IValue(TensorList::create(std::move(v))) {} - inline std::vector IValue::copyToIntList() const { - return toIntList()->elements().vec(); + return std::vector(toIntList()->elements()); } }} diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 5cb2c2c11ad5a7..f19d18caa9289e 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -248,12 +248,8 @@ std::string canonicalSchemaString(const FunctionSchema& schema) { using OperatorMap = std::unordered_map>>; struct OperatorRegistry { -private: - std::mutex lock; OperatorMap operators; - // list of operators whose schema have not yet been parsed, and must - // be registered before any call to lookup an opeator - std::vector> to_register; + std::mutex lock; // Those two maps are used to implement lookupByLiteral, which is needed for the n->match(...) calls. // Basically, every function schema is assigned a unique string you can use to match it. However, // parsing those strings or comparing and hashing them character by character would be very slow, so @@ -264,26 +260,18 @@ struct OperatorRegistry { // by performing a lookup in the operators_by_sig map. 
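// Illustrative sketch only (hypothetical names, not the registry code itself) of
// the literal-pointer memoization described above: string literals have stable
// addresses, so after one slow content-based lookup the result is cached under
// the const char* pointer, and later calls with the same literal are a single
// hash-map probe. slowLookupBySignature stands in for the operators_by_sig lookup.
#include <string>
#include <unordered_map>

inline int slowLookupBySignature(const std::string& sig) {
  return static_cast<int>(sig.size());  // demo stand-in for the real signature lookup
}

inline int lookupByLiteralDemo(const char* sig_literal) {
  static std::unordered_map<const char*, int> memo;
  auto it = memo.find(sig_literal);
  if (it == memo.end()) {
    // First time we see this literal: do the slow lookup once, then cache by pointer.
    it = memo.emplace(sig_literal, slowLookupBySignature(sig_literal)).first;
  }
  return it->second;
}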
std::unordered_map> operators_by_sig; std::unordered_map> operators_by_sig_literal; + void registerOperator(Operator&& op){ + std::lock_guard guard(lock); - // XXX - caller must be holding lock - void registerPendingOperators() { - for(auto op : to_register) { - Symbol sym = Symbol::fromQualString(op->schema().name); - operators[sym].push_back(op); - operators_by_sig[canonicalSchemaString(op->schema())] = op; - } - to_register.clear(); - } + Symbol sym = Symbol::fromQualString(op.schema.name); + auto op_ptr = std::make_shared(std::move(op)); -public: - void registerOperator(Operator&& op) { - std::lock_guard guard(lock); - to_register.push_back(std::make_shared(std::move(op))); + operators[sym].push_back(op_ptr); + + operators_by_sig[canonicalSchemaString(op.schema)] = op_ptr; } const std::shared_ptr& lookupByLiteral(const char * name) { - std::lock_guard guard(lock); - registerPendingOperators(); auto it = operators_by_sig_literal.find(name); if (it == operators_by_sig_literal.end()) { auto op_ptr_it = operators_by_sig.find(name); @@ -301,10 +289,8 @@ struct OperatorRegistry { return it->second; } - const std::vector>& getOperators(Symbol name) { std::lock_guard guard(lock); - registerPendingOperators(); static std::vector> empty; auto it = operators.find(name); if(it != operators.end()) @@ -356,16 +342,16 @@ bool typeMatches(TypePtr actual, TypePtr formal) { } bool Operator::matches(const Node* node) const { - if (node->kind().toQualString() != schema().name) { + if (node->kind().toQualString() != schema.name) { return false; } size_t attributes_size = node->numAttributes(); size_t attributes_seen = 0; auto inputs_size = node->inputs().size(); size_t input_i = 0; - for(size_t arg_i = 0; arg_i < schema().arguments.size(); ++arg_i) { + for(size_t arg_i = 0; arg_i < schema.arguments.size(); ++arg_i) { at::optional attribute_kind; - const Argument& arg = schema().arguments[arg_i]; + const Argument& arg = schema.arguments[arg_i]; if(attributes_size > 0 && (attribute_kind = attributeKindOf(arg.type))) { auto name = Symbol::fromQualString("attr::" + arg.name); if(!node->hasAttribute(name) || node->kindOf(name) != *attribute_kind) { @@ -373,6 +359,22 @@ bool Operator::matches(const Node* node) const { return false; } attributes_seen++; + } else if(*arg.type == *ListType::ofTensors()) { + // Tensor[] is handled as varargs, consume inputs until the remaining required arguments + // XXX - there can only be a single Tensor[] in a declaration + size_t remaining_required = 0; + for(size_t j = arg_i + 1; j < schema.arguments.size(); ++j){ + // remaining arguments are only those that won't be consumed from attributes + if(attributes_size == 0 || !attributeKindOf(schema.arguments[j].type)) + remaining_required++; + } + while(inputs_size - input_i > remaining_required) { + auto input = node->inputs()[input_i++]; + if(!typeMatches(input->type(), DynamicType::get())) { + // std::cout << "vararg argument is not Dynamic\n"; + return false; + } + } } else { if(input_i == inputs_size) { // std::cout << "not enough inputs\n"; @@ -386,11 +388,11 @@ bool Operator::matches(const Node* node) const { } } - if(!schema().is_vararg && input_i != inputs_size) { + if(!schema.is_vararg && input_i != inputs_size) { // std::cout << "not all inputs used\n" << input_i << " " << inputs_size << "\n"; return false; } - if(!schema().is_vararg && attributes_seen != attributes_size) { + if(!schema.is_vararg && attributes_seen != attributes_size) { // std::cout << "not all attributes used\n" << attributes_seen << " " << 
attributes_size << "\n"; return false; } @@ -424,7 +426,7 @@ const Operator& getOperatorFor(const Node* node) { er << "\ncandidates were:\n"; const auto& candidates = getAllOperatorsFor(node->kind()); for(auto & candidate : candidates) { - er << " " << candidate->schema() << "\n"; + er << " " << candidate->schema << "\n"; } throw er; } @@ -434,7 +436,7 @@ OperatorSet::OperatorSet(std::initializer_list sig_literals) { auto & registry = getRegistry(); for (const char * sig : sig_literals) { auto op = registry.lookupByLiteral(sig); - ops[Symbol::fromQualString(op->schema().name)].push_back(op); + ops[Symbol::fromQualString(op->schema.name)].push_back(op); } } diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index be2c20b01a5379..7e6a314d2cb8c3 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -2,81 +2,57 @@ // once C10 exists this can be removed, or stubbed out, but we need // it now to implement correct semantic checking for script #pragma once - +#include "ATen/ATen.h" #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/stack.h" -#include "ATen/ATen.h" - -#include -#include -#include -#include -#include -#include -#include - namespace torch { namespace jit { -FunctionSchema parseSchema(const std::string& schema); +FunctionSchema parseSchema(const std::string& decl); using OperationCreator = std::function; struct TORCH_API Operator { - Operator(FunctionSchema schema, OperationCreator op_creator) - : schema_(std::make_shared(std::move(schema))), - op_creator_(std::move(op_creator)) {} + Operator(FunctionSchema schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) + : schema(std::move(schema)) + , op(std::move(op)) + , op_const_attributes(std::move(op_const_attributes)) {} - Operator(const std::string& schema, OperationCreator op_creator) - : schema_string_(schema), op_creator_(std::move(op_creator)) {} + Operator(const std::string& schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) + : Operator(parseSchema(schema), std::move(op), std::move(op_const_attributes)) {} - // Helper constructor to register `op` to run + // Helper constructor to regsiter `op` to run // run for _every_ IR Node where n.kind() == name, regardless of arguments. - // This is accomplished by marking the schema varargs and having no required - // arguments. This is used for things like prim::While or prim::If that can - // take a number of different valid input types and lengths. - Operator(Symbol name, OperationCreator op_creator) - : Operator(FunctionSchema(name, {}, {}, true), std::move(op_creator)) {} - - Operator(FunctionSchema schema, Operation op) - : schema_(std::make_shared(std::move(schema))), - op_(std::make_shared(std::move(op))) {} - - Operator(const std::string& schema, Operation op) - : schema_string_(schema), - op_(std::make_shared(std::move(op))) {} - - bool matches(const Node* node) const; - - Operation getOperation(Node* node = nullptr) const { - if (op_) { - return *op_; + // This is accomplished by marking the schema varargs and having no required arguments. + // This is used for things like prim::While or prim::If that can take a number + // of different valid input types and lengths. 
+ Operator(Symbol name, OperationCreator op) + : Operator(FunctionSchema(name, {}, {}, true), op, op) {} + + FunctionSchema schema; + + bool matches(const Node* n) const; + // Operators have different versions depending on if some inputs are encoded + // as attributes or inputs. This function returns the right Operation function, + // given a node encoded for one variant. + // Behavior is undefined if matches(n) == false + // TODO (apaszke) : remove + Operation selectVariant(Node* n) const { + if(n->hasAttributes()) { + JIT_ASSERT(op_const_attributes != nullptr); + return op_const_attributes(n); + } else { + return op(n); } - AT_ASSERT(node != nullptr); - return op_creator_(node); } - - const FunctionSchema & schema() const { - // we lazily parse schema initialized from strings so that - // we do less work during static operator registration - if(!schema_) { - schema_ = std::make_shared(parseSchema(schema_string_.value())); - schema_string_ = at::nullopt; - } - return *schema_; + bool hasAttributedVersion() const { + return op_const_attributes != nullptr; } private: - mutable at::optional schema_string_; - // cannot use at::optional because windows has issues that require an assignment operator to be generated - // cannot use std::unique_ptr because initializer lists of Operators end up copying the Operator - mutable std::shared_ptr schema_; - - // Essentially a variant. - // NB: std::function has a default state (where it == nullptr). - std::shared_ptr op_; - OperationCreator op_creator_; + OperationCreator op; + OperationCreator op_const_attributes; }; const std::vector>& getAllOperatorsFor(Symbol name); @@ -86,7 +62,7 @@ const Operator& getOperatorFor(const Node* node); inline Operation getOperation(Node* node) { // note: getOperatorFor ensures that getOperatorFor(node).matches(node) == true // so the call to selectVariant is always valid. - return getOperatorFor(node).getOperation(node); + return getOperatorFor(node).selectVariant(node); } void registerOperator(Operator&& op); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 414dc1652a4da1..0e40bc8831a6df 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -3,9 +3,8 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/constants.h" -#include "torch/csrc/jit/symbolic_variable.h" -#include "torch/csrc/jit/assertions.h" #include "torch/csrc/utils/functional.h" +#include "torch/csrc/jit/assertions.h" #include #include @@ -192,11 +191,12 @@ void BatchMMBlock(Block* block) { int cat_dim = s == Side::LHS ? 
1 : 0; cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes + auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); WithInsertPoint iguard { root.node }; - auto inputs = fmap(matmuls, [=](Node *mm) -> SymbolicVariable { return mm->inputs()[inputs_off]; }); - auto cat_output = SymbolicVariable::cat(inputs, cat_dim).value(); - cat_output->setType(type->withSizes(cat_sizes)); - return cat_output; + inputs.push_back(insertConstant(*graph, cat_dim)); + Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); + cat->output()->setType(type->withSizes(cat_sizes)); + return cat->output(); }; auto lhs_batch = batch_inputs(Side::LHS, root.lhs_sizes); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp deleted file mode 100644 index 39492f9e76c50c..00000000000000 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "torch/csrc/jit/passes/constant_propagation.h" -#include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/constants.h" -#include "torch/csrc/jit/interpreter.h" -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/ivalue.h" -#include "torch/csrc/jit/operator.h" -#include "torch/csrc/jit/passes/dead_code_elimination.h" -#include "torch/csrc/utils/functional.h" - -namespace torch { namespace jit { - -namespace { - -std::unordered_set skip_list = { - //FIXME If & Loop require special casing because they cannot be run as a - //single node. - prim::If, - prim::Loop, - //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be - //FIXME treated as having side effects but ONNX depends on them being removed - prim::Print, - //all the rand functions from native_functions.yaml - aten::permute, - aten::rand, - aten::rand_out, - aten::rand_like, - aten::randint, - aten::randint_out, - aten::randint_like, - aten::randn, - aten::randn_out, - aten::randn_like, - aten::randperm, - aten::randperm_out, - }; - -std::vector runNode(Node* n) { - auto op = getOperation(n); - Stack stack; - for (auto input : n->inputs()) { - stack.push_back(*(toIValue(input))); - } - op(stack); - auto var_outputs = fmap(stack, [&](IValue v) { - if (v.isTensor()) { - return IValue(autograd::as_variable_ref(v.toTensor()).data()); - } else { - return v; - } - }); - return var_outputs; -} - -void propagateNode(Node* n) { - auto outputs = runNode(n); - auto graph = n->owningGraph(); - WithInsertPoint guard(n); - for (size_t i = 0; i < outputs.size(); ++i) { - auto new_output = insertConstant(*graph, outputs[i]); - n->outputs()[i]->replaceAllUsesWith(new_output); - // let dce elimination remove n - } -} - -} // anonymous namespace - -void ConstantPropagation(Node* n, bool recurse) { - bool constant_inputs = (n->inputs().size() > 0) && - std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - bool supported_node = skip_list.count(n->kind()) == 0; - if (constant_inputs && supported_node) { - propagateNode(n); - } - if (recurse) { - for (Block * block : n->blocks()) - ConstantPropagation(block, recurse); - } -} - -void ConstantPropagation(Block* block, bool recurse) { - ConstantPropagation(block->param_node(), recurse); - for (auto n: block->nodes()) { - ConstantPropagation(n, recurse); - } -} - -void ConstantPropagation(std::shared_ptr& graph) { - ConstantPropagation(graph->block(), true); - EliminateDeadCode(graph); -} - -}} diff --git a/torch/csrc/jit/passes/constant_propagation.h 
b/torch/csrc/jit/passes/constant_propagation.h deleted file mode 100644 index 12df329c81ccfc..00000000000000 --- a/torch/csrc/jit/passes/constant_propagation.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "torch/csrc/jit/ir.h" - -namespace torch { namespace jit { - -TORCH_API void ConstantPropagation(std::shared_ptr& graph); -TORCH_API void ConstantPropagation(Block* block, bool recurse); -TORCH_API void ConstantPropagation(Node* n, bool recurse); - -}} diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index cc8dcb8926dee0..cb3757cffb0e34 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -177,25 +177,16 @@ struct GraphFuser { } } - bool isFusableCatNode(Node * node) { - if (node->kind() != aten::cat) - return false; - if (!node->is_constant(attr::dim)) - return false; + bool allCatInputsHaveSameSize(Node * node) { + JIT_ASSERT(node->kind() == aten::cat); + std::vector inputs = node->inputs(); + if (!node->hasAttributes()) { + inputs.pop_back(); // Get rid of the dim argument + } - auto tensors_node = node->namedInput(attr::tensors)->node(); - if (tensors_node->kind() != prim::ListConstruct) return false; - // NB: Note that technically other uses of the list aren't a big problem for us. - // It would be enough to place the prim::FusedConcat before the prim::ListConstruct, and - // allUsersAreThisConsumerOrOccurAfterIt would still be satisfied. However, I don't expect this - // to be necessary any time soon, and so we're simply assuming that we don't have to deal with that. - if (tensors_node->output()->uses().size() > 1) return false; - auto tensors = tensors_node->inputs(); - - // Our fusion code assumes that all inputs have the same shapes, so we need to check this too. - auto expected = tensors.at(0)->type()->cast(); + auto expected = inputs.at(0)->type()->cast(); if (!expected) return false; - return std::all_of(tensors.begin(), tensors.end(), [&expected](Value *v) { + return std::all_of(inputs.begin(), inputs.end(), [expected](Value *v) { auto actual = v->type()->cast(); return actual && actual->sizes() == expected->sizes(); }); @@ -206,7 +197,15 @@ struct GraphFuser { // because it is not a simple map, can be put in a fusion group // as long as no items in the group read the output of concat bool isFusableAsExitNode(Node * node) { - return isFusable(node) || isFusableCatNode(node); + if(isFusable(node)) + return true; + // this concat fusion only works when all the inputs are the same size + // and we can statically infer the dimension along which we should concat + // otherwise they cannot partipate in the same map + if(node->kind() == aten::cat && node->is_constant(attr::dim) && allCatInputsHaveSameSize(node)) + return true; + + return false; } // necessary condition for fusion. If all of the uses of producer are consumer @@ -242,9 +241,8 @@ struct GraphFuser { // we can move the consumer up into the producer. // but this requires better handling of merging fusion groups so it is not done now at::optional consumer_device = getDevice(consumer); - Node *real_consumer = consumer->kind() == aten::cat ? 
consumer->namedInput(attr::tensors)->node() : consumer; return isFusable(producer->node()) && - allUsersAreThisConsumerOrOccurAfterIt(real_consumer, producer) && + allUsersAreThisConsumerOrOccurAfterIt(consumer, producer) && consumer_device && consumer_device == getDevice(producer->node()) && (*consumer_device != kCPUDevice || sharedFusionCompiler().canCompileOnCPU()); } @@ -391,24 +389,7 @@ struct GraphFuser { Node * fuse(Node * consumer, Value * producer) { auto group = consumer; - if (consumer->kind() == aten::cat) { - Graph * graph = consumer->owningGraph(); - Node * list_construct = consumer->namedInput(attr::tensors)->node(); - int64_t dim = consumer->get(attr::dim).value(); - - Node * fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())->i_(attr::dim, dim); - fused_cat->insertBefore(list_construct); - fused_cat->output()->copyMetadata(consumer->output()); - consumer->output()->replaceAllUsesWith(fused_cat->output()); - topological_index[fused_cat] = topological_index[list_construct]; - - // NB: this deletes the fused_cat node from the original graph - group = createSingletonFusionGroup(fused_cat); - consumer->destroy(); - if (list_construct->output()->uses().empty()) { - list_construct->destroy(); - } - } else if (consumer->kind() != prim::FusionGroup) { + if(group->kind() != prim::FusionGroup) { group = createSingletonFusionGroup(consumer); } if (producer->node()->kind() == prim::FusionGroup) { @@ -469,6 +450,7 @@ struct GraphFuser { } } + // TODO: Remove this restriction if we ever need to distribute across // multiple return operators Node * producer_for_chunk_node = producer_for_chunk->node(); JIT_ASSERT(producer_for_chunk_node->outputs().size() == 1); @@ -539,14 +521,11 @@ struct GraphFuser { std::pair scanNode(Node * consumer) { auto stage_guard = block->owningGraph()->setStageTemporary(consumer->stage()); if(isFusableAsExitNode(consumer)) { - value_list inputs; - auto consumer_inputs = consumer->kind() == aten::cat ? - consumer->namedInput(attr::tensors)->node()->inputs() : - consumer->inputs(); // handle inputs in reverse topological order as well... // otherwise in f(a,a+b) it will appear a is used twice if we consider // the f-a fusion before the f-(a+b) fusion first. 
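As context for the allCatInputsHaveSameSize check introduced above: aten::cat is only admitted as a fusion exit node when the concat dimension is a compile-time constant and every input has the same statically known shape. A rough standalone sketch of that shape test, using plain std::vector shapes instead of the JIT's TensorType (all names here are illustrative, not the real fuser API):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for TensorType::sizes(): one shape per cat input.
using Shape = std::vector<int64_t>;

// Mirrors the "all cat inputs have the same size" requirement: fusing a
// concat exit node is only attempted when every input shape is known and equal.
bool allCatInputsHaveSameSize(const std::vector<Shape>& input_shapes) {
  if (input_shapes.empty()) return false;
  const Shape& expected = input_shapes.front();
  for (const Shape& s : input_shapes) {
    if (s != expected) return false;  // any mismatch disqualifies the cat
  }
  return true;
}

int main() {
  std::vector<Shape> ok  = {{2, 3}, {2, 3}, {2, 3}};
  std::vector<Shape> bad = {{2, 3}, {4, 3}};
  std::cout << allCatInputsHaveSameSize(ok) << " "    // 1
            << allCatInputsHaveSameSize(bad) << "\n"; // 0
}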
- for(auto i : consumer_inputs) { + value_list inputs; + for(auto i : consumer->inputs()) { if (i->node()->owningBlock() == block) { inputs.push_back(i); JIT_ASSERT(topological_index.count(i->node()) > 0); diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h index 0ec3589e3acd31..a0a881e3002ed9 100644 --- a/torch/csrc/jit/passes/lower_grad_of.h +++ b/torch/csrc/jit/passes/lower_grad_of.h @@ -10,6 +10,6 @@ namespace torch { namespace jit { // outputs = // else: // outputs = undefineds -TORCH_API void LowerGradOf(Graph& g); +TORCH_API void LowerGradOf(Graph& graph); }} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index ee9b76f417bd17..63fb7030aa3ad1 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -263,39 +263,6 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { default: break; // fall-through } - if (node->matches("aten::cat(Tensor[] tensors, int dim) -> Tensor", /*with_const=*/attr::dim)) { - auto list_node = node->namedInput(attr::tensors)->node(); - JIT_ASSERT(list_node->kind() == prim::ListConstruct); - auto tensors = list_node->inputs(); - if (tensors.size() > 0) { - auto input_types = fmap(tensors, [](Value *v) { return v->type()->cast(); }); - if (std::all_of(input_types.begin(), input_types.end(), - [](const TensorTypePtr& tp) { return tp != nullptr; })) { - std::vector sizes = input_types[0]->sizes(); - const int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); - const int64_t ndim = sizes.size(); - - if (dim < 0 || dim >= ndim) - goto cat_fail; - - sizes[dim] = 0; - for (auto & tp : input_types) { - auto & tp_sizes = tp->sizes(); - if (sizes.size() != tp_sizes.size()) - goto cat_fail; - for (int64_t i = 0; i < ndim; ++i) { - if (sizes[i] != tp_sizes[i] && i != dim) { - goto cat_fail; - } - } - sizes[dim] += tp_sizes[dim]; - } - node->output()->setType(input_types[0]->withSizes(sizes)); - return; - } - } - } -cat_fail: bool can_propagate_by_running = canPropagateShapeByRunningIt(node); auto maybe_tensor_types = gatherTensorTypes(node); diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index f78da9b92baccc..5494cf2b78a798 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -3,530 +3,59 @@ namespace torch { namespace jit { -std::unordered_map>> ToBatch::batch_operator_table; - -std::shared_ptr ToBatch::getBatchOperator(std::string name, int64_t num_inputs){ - if(batch_operator_table.find(name) == batch_operator_table.end()){ - throw std::runtime_error("function " + name + " is not supported in batched tensor yet"); - } - auto ops = batch_operator_table.at(name); - if(num_inputs == -1) // default function - return ops[0]; - for(auto op : ops){ - if(size_t(num_inputs) == op->inputs().size()) - return op; - } - throw std::runtime_error("function " + name + " with " + std::to_string(num_inputs) + " inputs is not supported in batched tensor yet"); -} - -// replace aten operator node with BatchTensor operator graph -void ToBatch::visitAten(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - auto func_name = std::string(n->kind().toUnqualString()); - std::vector new_inputs; - for(Value *input : n->inputs()){ - if(rn_env.find(input) == rn_env.end()){ // non-tensor input - auto new_input = batch_map.at(input); - new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); - } - else{ // batched tensor input 
- new_inputs.push_back(rn_env.at(input)); - } - } - - // transform scalar to tensor before pass to batch operator script - for(size_t i = 0; i < new_inputs.size(); i++){ - auto input = new_inputs[i]; - if(input->type() == IntType::get() || input->type() == FloatType::get()){ - auto to_tensor_node = res_graph->createNumToTensor(input); - res_graph->insertNode(to_tensor_node); - new_inputs[i] = to_tensor_node->output(); - } - } - - auto batch_graph = getBatchOperator(func_name, new_inputs.size()); - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); - - // Assume all outputs from inlined operator implementation are in the triple form batched tensor or just a single non-tensor. - if(outputs.size() == 1){ - // if previous output is scalar, transform new output back to scalar from dynamic - if(n->outputs()[0]->type() != outputs[0]->type()){ - Node* to_scalar_node; - if(n->outputs()[0]->type() == IntType::get()){ - to_scalar_node = res_graph->createTensorToNum(IntType::get(), outputs[0]); - } - else if(n->outputs()[0]->type() == FloatType::get()){ - to_scalar_node = res_graph->createTensorToNum(FloatType::get(), outputs[0]); - } - else{ - throw std::runtime_error("NYI: scalar type other than int, float is not supported yet"); - } - res_graph->insertNode(to_scalar_node); - rn_env[n->outputs()[0]] = to_scalar_node->output(); - } - else - rn_env[n->outputs()[0]] = outputs[0]; - } - else{ - for(size_t i = 0; i < n->outputs().size(); i++){ - auto output = n->outputs()[i]; - batch_map[output] = std::vector(outputs.begin() + i * EXP_BTENSOR_SIZE, outputs.begin() + i * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE); - } - } -} - -// clone prim::Constant to new graph -// batching transformation is applied to the output of prim::NumToTensor. -// If there is a prim::NumToTensor following prim::Constant, it will be finally transformed to BatchTensor. 
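The visitAten path above assumes every batched tensor travels through this pass as the expanded triple {data, mask, dims}: the data is produced by the underlying op, masks are combined multiplicatively, and dims are OR-ed, as the transformed graphs later in this file show. A minimal self-contained sketch of that convention, with plain std::vector entries standing in for tensors (the struct and function names are made up for illustration):

#include <iostream>
#include <vector>

// Illustrative stand-in for the expanded BatchTensor triple {data, mask, dims}:
// data is the payload, mask marks which entries are valid, dims marks which
// dimensions are dynamic.
struct Batched {
  std::vector<double> data;
  std::vector<bool>   mask;
  std::vector<bool>   dims;
};

// A batched elementwise add: add the data, multiply (AND) the masks, OR the
// dims, mirroring aten::add -> {aten::add, aten::mul, aten::__or__}.
Batched batched_add(const Batched& a, const Batched& b) {
  Batched out = a;
  for (size_t i = 0; i < out.data.size(); ++i) {
    out.data[i] = a.data[i] + b.data[i];
    out.mask[i] = a.mask[i] && b.mask[i];
  }
  for (size_t d = 0; d < out.dims.size(); ++d)
    out.dims[d] = a.dims[d] || b.dims[d];
  return out;
}

int main() {
  Batched a{{1, 2, 3}, {true, true, false}, {true}};
  Batched b{{4, 5, 6}, {true, false, true}, {false}};
  Batched c = batched_add(a, b);
  for (double v : c.data) std::cout << v << " ";  // 5 7 9
  std::cout << "\n";
}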
-void ToBatch::visitConstant(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - rn_env[n->output()] = r_node->output(); -} - -// change return tensor to expanded batched tensor, eg: {data, mask, dims} -void ToBatch::visitNumToTensor(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("batch_from_scalar_tensor"), r_node->outputs()); - batch_map[n->output()] = outputs; -} - -// clone prim::TensorToNum to new graph -void ToBatch::visitTensorToNum(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - if(rn_env.find(n->input()) == rn_env.end()){ - rn_env[n->input()] = batch_map.at(n->input())[0]; - } - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - rn_env[n->output()] = r_node->output(); - batch_map[n->output()] = batch_map.at(n->input()); -} - -// clone prim::ListConstruct to new graph -void ToBatch::visitListConstruct(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - if(n->inputs()[0]->type() == DynamicType::get()){ // TensorList: expand directly - std::vector inputs; - for(Value* input: n->inputs()) { - auto res = batch_map.at(input); - inputs.insert(inputs.end(), res.begin(), res.end()); - } - batch_map[n->output()] = inputs; - } - else { // ScalarList: transform to tensor, then transform back - for(Value* input : n->inputs()) { - if(rn_env.find(input) == rn_env.end()){ - rn_env[input] = batch_map.at(input)[0]; - } - } - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - // transform int[] to tensor - auto to_tensor_node = res_graph->create(Symbol::fromQualString("aten::_list_to_tensor")); - to_tensor_node->setStage(n->stage()); - to_tensor_node->addInput(r_node->output()); - res_block->appendNode(to_tensor_node); - rn_env[n->output()] = to_tensor_node->output(); - } -} - -// prim::If transformation: -// elif is not supported -// -// transformation example: -// @torch.jit.batch(batch_size=4) -// def batch_if(a, b): -// if a > b: -// a += b -// else: -// a -= b -// return a -// -// original graph: -// graph(%a.1 : Dynamic -// %b : Dynamic) { -// %2 : Dynamic = aten::gt(%a.1, %b) -// %a : Dynamic = prim::If(%2) -// block0() { -// %a.2 : Dynamic = aten::add[alpha={1}](%a.1, %b) -// -> (%a.2) -// } -// block1() { -// %a.3 : Dynamic = aten::sub[alpha={1}](%a.1, %b) -// -> (%a.3) -// } -// return (%a); -// } -// -// transformed graph: -// graph(%a.1_data : Dynamic -// %a.1_mask : Dynamic -// %a.1_dims : Dynamic -// %b_data : Dynamic -// %b_mask : Dynamic -// %b_dims : Dynamic) { -// %6 : Dynamic = aten::gt(%a.1_data, %b_data) // calculate condition -// %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %9 : int = prim::TensorToNum(%6) -// %10 : Long() = prim::Constant[value={1}]() // if_block -// %alpha.1 : float = prim::TensorToNum(%10) -// %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) -// %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %15 : Long() = prim::Constant[value={1}]() // 
else_block -// %alpha : float = prim::TensorToNum(%15) -// %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) -// %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %20 : Dynamic = aten::type_as(%7, %6) // combine two outputs (batch_where) -// %cond_mask.1 : Dynamic = aten::mul(%6, %20) -// %22 : int = aten::dim(%cond_mask.1) -// %23 : int = prim::Constant[value=1]() -// %24 : int = aten::eq(%22, %23) -// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%24) -// block0() { -// %28 : int = aten::dim(%data.1) -// %29 : int = prim::Constant[value=1]() -// %30 : int = aten::sub(%28, %29) -// %31 : int = prim::Constant[value=1]() -// %data.3 : Dynamic = prim::Loop(%30, %31, %cond_mask.1) -// block0(%_ : int, %34 : Dynamic) { -// %35 : int = prim::Constant[value=1]() -// %36 : int = aten::neg(%35) -// %data.2 : Dynamic = aten::unsqueeze(%34, %36) -// %38 : int = prim::Constant[value=1]() -// -> (%38, %data.2) -// } -// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) -// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) -// -> (%cond_data.1, %cond_mask.2, %data.3) -// } -// block1() { -// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) -// } -// %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) -// %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) -// %res_dims : Dynamic = aten::__or__(%dims.1, %dims) -// return (%res_data, %res_mask, %res_dims); -// } -void ToBatch::visitIf(Node* n, Block* block, Block* res_block){ - toBatch(n->blocks()[0], res_block); - toBatch(n->blocks()[1], res_block); - - // combine results from two if paths - for(size_t i = 0; i < n->outputs().size(); i++){ - std::vector inputs; - if(batch_map.find(n->input()) == batch_map.end()){ // cond is scalar - inputs.push_back(rn_env.at(n->input())); - } - else{ // cond is tensor - auto cond = batch_map.at(n->input()); - inputs.insert(inputs.end(), cond.begin(), cond.end()); - } - auto if_output = batch_map.at(n->blocks()[0]->outputs()[i]); - inputs.insert(inputs.end(), if_output.begin(), if_output.end()); - auto else_output = batch_map.at(n->blocks()[1]->outputs()[i]); - inputs.insert(inputs.end(), else_output.begin(), else_output.end()); - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where", inputs.size()), inputs); - batch_map[n->outputs()[i]] = outputs; - } -} - -// prim::Loop transformation: -// -// transformation example: -// @torch.jit.batch(batch_size=4) -// def batch_while(a, b): -// while a > b: -// a -= b -// return a -// -// original graph: -// graph(%a.1 : Dynamic -// %b : Dynamic) { -// %2 : int = prim::Constant[value={2147483647}]() -// %3 : Dynamic = aten::gt(%a.1, %b) -// %a : Dynamic = prim::Loop(%2, %3, %a.1) -// block0(%4 : Dynamic, %5 : Dynamic) { -// %a.2 : Dynamic = aten::sub[alpha={1}](%5, %b) -// %9 : Dynamic = aten::gt(%a.2, %b) -// -> (%9, %a.2) -// } -// return (%a); -// } -// -// transformed graph: -// graph(%a.1_data : Dynamic -// %a.1_mask : Dynamic -// %a.1_dims : Dynamic -// %b_data : Dynamic -// %b_mask : Dynamic -// %b_dims : Dynamic) { -// %6 : int = prim::Constant[value=2147483647]() -// %7 : Dynamic = aten::gt(%a.1_data, %b_data) -// %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %10 : int = prim::TensorToNum(%7) -// %11 : Dynamic = aten::mul(%7, %8) -// %12 : Dynamic = aten::sum(%11) -// %13 : Dynamic = aten::gt[other={0}](%12) // cond_any -// %14 : int = prim::TensorToNum(%13) 
-// %62 : Dynamic, %63 : Dynamic, %64 : Dynamic, %a : Dynamic, %60 : Dynamic, %61 : Dynamic = prim::Loop(%6, %14, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) -// block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { -// %23 : Long() = prim::Constant[value={1}]() -// %alpha : float = prim::TensorToNum(%23) -// %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) -// %mask : Dynamic = aten::mul(%6_mask, %b_mask) -// %dims : Dynamic = aten::__or__(%6_dims, %b_dims) -// %28 : Dynamic = aten::gt(%data.1, %b_data) -// %29 : Dynamic = aten::mul(%mask, %b_mask) -// %30 : Dynamic = aten::__or__(%dims, %b_dims) -// %31 : int = prim::TensorToNum(%28) -// %32 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) // update outputs (batch_where) -// %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %32) -// %34 : int = aten::dim(%cond_mask.1) -// %35 : int = prim::Constant[value=1]() -// %36 : int = aten::eq(%34, %35) -// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%36) -// block0() { -// %40 : int = aten::dim(%data.1) -// %41 : int = prim::Constant[value=1]() -// %42 : int = aten::sub(%40, %41) -// %43 : int = prim::Constant[value=1]() -// %data.3 : Dynamic = prim::Loop(%42, %43, %cond_mask.1) -// block0(%_ : int, %46 : Dynamic) { -// %47 : int = prim::Constant[value=1]() -// %48 : int = aten::neg(%47) -// %data.2 : Dynamic = aten::unsqueeze(%46, %48) -// %50 : int = prim::Constant[value=1]() -// -> (%50, %data.2) -// } -// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) -// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) -// -> (%cond_data.1, %cond_mask.2, %data.3) -// } -// block1() { -// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) -// } -// %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) -// %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) -// %res_dims : Dynamic = aten::__or__(%dims, %6_dims) -// %56 : Dynamic = aten::mul(%28, %29) -// %57 : Dynamic = aten::sum(%56) -// %58 : Dynamic = aten::gt[other={0}](%57) -// %59 : int = prim::TensorToNum(%58) -// -> (%59, %28, %29, %30, %res_data, %res_mask, %res_dims) -// } -// return (%a, %60, %61); -// } -void ToBatch::visitLoop(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - // bool cond_is_tensor indicates whether cond is tensor - // cond_is_tensor = false, eg: for loop, n->inputs()[1] = byte() - // cond_is_tensor = true, eg: in some while loop, cond is a batched tensor, - // we need to add expanded cond to the inputs of loop node and block, - // and compute cond_any as cond for while loop - bool cond_is_tensor = (batch_map.find(n->inputs()[1]) != batch_map.end()); - - // create prim::Loop node for res_block - - // type of cond in loop should be int type - if(rn_env.at(n->inputs()[0])->type() != IntType::get()){ - auto to_int_node = res_graph->createTensorToNum(IntType::get(), rn_env.at(n->inputs()[0])); - res_graph->insertNode(to_int_node); - rn_env[n->inputs()[0]] = to_int_node->output(); - } - if(cond_is_tensor){ - auto cond = batch_map.at(n->inputs()[1]); - auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); - auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); - res_graph->insertNode(to_int_node); - rn_env[n->inputs()[1]] = to_int_node->output(); - } - for(size_t i = 2; i < n->inputs().size(); i++){ - auto input = n->inputs()[i]; - rn_env[input] = 
batch_map.at(input)[0]; - } - auto* r_node = res_graph->createClone(n, rn_fn, /*copy_blocks=*/false); - - // change inputs of prim::Loop - if(cond_is_tensor){ - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - auto cond = batch_map.at(n->inputs()[1]); - r_node->insertInput(i + 2, cond[i]); - } - } - for(size_t i = 2; i < n->inputs().size(); i++){ - for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ - r_node->insertInput((i - 2) * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE * cond_is_tensor + 2 + j, batch_map.at(n->inputs()[i])[j]); - } - } - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - - // create block for Loop node in res_block - // if cond is tensor: first 4 inputs of block: cond_any, cond_data, cond_mask, cond_dims - // if cond is not tensor: first 1 input of block: cond - auto loop_block = r_node->addBlock(); - - // add inputs - loop_block->addInput("loop_num"); - loop_block->inputs()[0]->setType(IntType::get()); - rn_env[n->blocks()[0]->inputs()[0]] = loop_block->inputs()[0]; - if(cond_is_tensor){ - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - loop_block->addInput("cond_" + EXP_BTENSOR_NAME[i]); - } - } - for(size_t i = 1; i < n->blocks()[0]->inputs().size(); i++){ - auto input = n->blocks()[0]->inputs()[i]; - auto name = input->uniqueName(); - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - loop_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); - } - batch_map[input] = std::vector(loop_block->inputs().slice((i - 1) * EXP_BTENSOR_SIZE + 1 + EXP_BTENSOR_SIZE * cond_is_tensor, EXP_BTENSOR_SIZE).vec()); - } - - toBatch(n->blocks()[0], loop_block); - - WithInsertPoint guard(loop_block); - - // use where operator to update variables and add to outputs - for(size_t i = 0; i < n->outputs().size(); i++){ - std::vector inputs, outputs; - if(cond_is_tensor){ - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - inputs.push_back(loop_block->inputs()[j + 1]); - } - auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); - inputs.insert(inputs.end(), data.begin(), data.end()); - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + EXP_BTENSOR_SIZE + 1]); - } - outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where"), inputs); - } - else{ - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + 1]); - } - auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); - inputs.insert(inputs.end(), data.begin(), data.end()); - outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("update"), inputs); - } - batch_map[n->outputs()[i]] = outputs; - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - loop_block->registerOutput(outputs[j]); - } - } - - // update loop conditions - if(cond_is_tensor){ - auto cond = batch_map.at(n->blocks()[0]->outputs()[0]); - auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); - auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); - res_graph->insertNode(to_int_node); - loop_block->insertOutput(0, to_int_node->output()); - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - loop_block->insertOutput(i + 1, cond[i]); - } - } - else{ - auto cond = rn_env.at(n->blocks()[0]->outputs()[0]); - loop_block->insertOutput(0, cond); - } - - // change outputs of prim::Loop - auto size = r_node->outputs().size(); - for(size_t i = 0; i < size; i++){ - for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ - r_node->insertOutput(i * EXP_BTENSOR_SIZE + j); - } - 
batch_map[n->outputs()[i]] = r_node->outputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec(); - } - // add cond to outputs of loop node - if(cond_is_tensor){ - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - r_node->insertOutput(i); - } - } -} +std::unordered_map> ToBatch::batch_operator_table; void ToBatch::toBatch(Block* block, Block* res_block) { - WithInsertPoint guard(res_block); - - // change inputs of block - expand tensor to batchtensor eg: (data, mask, dims) - // eg: a -> a_data, a_mask, a_dims - // for block in prim::Loop, register inputs separately to deal with cond - if(!block->owningNode() || block->owningNode()->kind() != prim::Loop){ - auto size = block->inputs().size(); - for(size_t i = 0; i < size; i++){ - auto input = block->inputs()[i]; - auto name = input->uniqueName(); - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - res_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); - } - batch_map[input] = std::vector(res_block->inputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec()); - } + // change inputs of a graph - expand tensor to {data, mask, dims} + auto size = block->inputs().size(); + for(size_t i = 0; i < size; i++){ + auto input = block->inputs()[i]; + auto name = input->uniqueName(); + res_block->addInput(name + "_data"); + res_block->addInput(name + "_mask"); + res_block->addInput(name + "_dims"); + batch_map[input] = std::vector(res_block->inputs().slice(i * 3, 3)); } for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) { auto n = *it; + // replace tensor operator to BatchTensor operator if(n->kind().is_aten()){ - visitAten(n, block, res_block); - } - else if(n->kind().is_prim()){ - switch(n->kind()){ - case prim::Constant: - visitConstant(n, block, res_block); - break; - case prim::NumToTensor: - visitNumToTensor(n, block, res_block); - break; - case prim::TensorToNum: - visitTensorToNum(n, block, res_block); - break; - case prim::ListConstruct: - visitListConstruct(n, block, res_block); - break; - case prim::If: - visitIf(n, block, res_block); - break; - case prim::Loop: - visitLoop(n, block, res_block); - break; - default: - throw std::runtime_error("NYI: node of prim kind other than [Constant, NumToTensor, TensorToNum, If, Loop] is not supported yet"); + auto batch_graph = batch_operator_table.at(n->kind().toUnqualString()); + WithInsertPoint guard(res_block); + std::vector new_inputs; + for(Value *input : n->inputs()){ + if(batch_map.find(input) != batch_map.end()){ + auto new_input = batch_map.at(input); + new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); + } + else{ + throw std::runtime_error("NYI: non-tensor input for aten operator is not supported yet"); + } + } + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); + // Assume all outputs from inlined operator implementation are in the triple form. 
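In the simplified toBatch above, each aten node is replaced by inlining the single graph registered under its name over the flattened {data, mask, dims} inputs. A rough standalone sketch of that table-lookup-and-apply flow, with std::function standing in for the registered Graph and for script::inlineCallTo (all names below are stand-ins, not the real API):

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for a registered batch operator: consumes the flattened
// {data, mask, dims} inputs and produces outputs in the same triple form.
using BatchOp = std::function<std::vector<double>(const std::vector<double>&)>;

// Mirrors ToBatch::batch_operator_table: one implementation per op name.
std::unordered_map<std::string, BatchOp> batch_operator_table = {
  {"add", [](const std::vector<double>& in) {
     // in = {a_data, a_mask, a_dims, b_data, b_mask, b_dims}
     return std::vector<double>{in[0] + in[3],    // data
                                in[1] * in[4],    // mask
                                in[2] + in[5]};   // dims (placeholder OR)
   }},
};

std::vector<double> visitAtenLike(const std::string& name,
                                  const std::vector<double>& flat_inputs) {
  auto it = batch_operator_table.find(name);
  if (it == batch_operator_table.end())
    throw std::runtime_error("function " + name +
                             " is not supported in batched tensor yet");
  return it->second(flat_inputs);  // "inline" the registered implementation
}

int main() {
  auto out = visitAtenLike("add", {1, 1, 0, 2, 1, 0});
  std::cout << out[0] << " " << out[1] << " " << out[2] << "\n";  // 3 1 0
}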
+ for(size_t i = 0; i < n->outputs().size(); i++){ + auto output = n->outputs()[i]; + batch_map[output] = std::vector(outputs.begin() + i * 3, outputs.begin() + i * 3 + 3); } } - else{ - throw std::runtime_error("NYI: node that is not aten or prim kind is not supported yet"); + else if(n->kind().is_prim()){ + throw std::runtime_error("NYI: node of prim kind is not supported to transform to batch graph yet"); } } - // change outputs of block - expand tensor to batchtensor(data, mask, dims) - // for block in prim::Loop, register outputs separately to deal with cond and cond_any - // for block in prim::If, register outputs separately by combining outputs from two paths and return - if(!block->owningNode() || (block->owningNode()->kind() != prim::Loop && block->owningNode()->kind() != prim::If)) { - for(Value* output : block->outputs()){ - auto r_output = batch_map.at(output); - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - res_block->registerOutput(r_output[i]); - } - } + // change outputs of a graph - expand tensor to {data, mask, dims} + for(Value* output : block->outputs()){ + auto r_output = batch_map.at(output); + res_block->registerOutput(r_output[0]); + res_block->registerOutput(r_output[1]); + res_block->registerOutput(r_output[2]); } } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ // std::cout<toString()< res_graph = std::make_shared(graph->scope_root()); + auto res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); // std::cout<toString()<(); m.def("to_batch_graph", &to_batch_graph); m.def("register_batch_operator", [](std::string name, std::shared_ptr graph){ - ToBatch::batch_operator_table[name].push_back(graph); + ToBatch::batch_operator_table[name] = graph; }); } diff --git a/torch/csrc/jit/passes/to_batch.h b/torch/csrc/jit/passes/to_batch.h index 6545e2a2d4f8ed..23c23a0632b310 100644 --- a/torch/csrc/jit/passes/to_batch.h +++ b/torch/csrc/jit/passes/to_batch.h @@ -3,33 +3,14 @@ #include "torch/csrc/jit/pybind.h" #include "torch/csrc/jit/ir.h" -#include - namespace torch { namespace jit { class ToBatch { private: - // number of tensors to represent a expanded BatchTensor. {data, mask, dims} for now. 
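The header comment above, together with the EXP_BTENSOR_SIZE / EXP_BTENSOR_NAME constants it describes (removed just below), fixes the expansion convention: each tensor value becomes three values named <name>_data, <name>_mask, <name>_dims. A tiny standalone sketch of that renaming scheme (illustrative names only, independent of the JIT graph types):

#include <iostream>
#include <string>
#include <vector>

// Expansion convention used by the pass: one original tensor input is
// replaced by three inputs carrying the data/mask/dims suffixes.
const std::vector<std::string> EXP_BTENSOR_NAME = {"data", "mask", "dims"};

std::vector<std::string> expandInputName(const std::string& name) {
  std::vector<std::string> expanded;
  for (const std::string& suffix : EXP_BTENSOR_NAME)
    expanded.push_back(name + "_" + suffix);
  return expanded;
}

int main() {
  for (const std::string& n : expandInputName("a.1"))
    std::cout << n << "\n";  // a.1_data, a.1_mask, a.1_dims
}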
- const size_t EXP_BTENSOR_SIZE = 3; - const std::vector EXP_BTENSOR_NAME = {"data", "mask", "dims"}; // mapping from tensor in original graph to {data, mask, dims} in new graph std::unordered_map> batch_map; - // mapping from input in original graph to new input in new graph - used in createClone - std::unordered_map rn_env; - std::function rn_fn = [this](Value* v) { return rn_env.at(v); }; - -private: - std::shared_ptr getBatchOperator(std::string name, int64_t input_num = -1); - void visitAten(Node* n, Block* block, Block* res_block); - void visitConstant(Node* n, Block* block, Block* res_block); - void visitNumToTensor(Node* n, Block* block, Block* res_block); - void visitTensorToNum(Node* n, Block* block, Block* res_block); - void visitListConstruct(Node* n, Block* block, Block* res_block); - void visitIf(Node* n, Block* block, Block* res_block); - void visitLoop(Node* n, Block* block, Block* res_block); - public: - static std::unordered_map>> batch_operator_table; + static std::unordered_map> batch_operator_table; TORCH_API void toBatch(Block* block, Block* res_block); }; diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 0598e651d32437..415fc311086ac9 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -4,70 +4,26 @@ namespace torch { namespace jit { -inline Stack createStack(const py::tuple& tuple, at::ArrayRef inputs, size_t reserve_extra_space = 0) { - if (tuple.size() != inputs.size()) { - throw std::runtime_error("expected " + std::to_string(inputs.size()) + - " inputs, but got " + std::to_string(tuple.size())); - } - static const auto castToIValue = [](const py::object& obj, Type& t) -> IValue{ - switch (t.kind()) { - case TypeKind::DynamicType: - case TypeKind::TensorType: - return py::cast(obj); - case TypeKind::FloatType: - return py::cast(obj); - case TypeKind::IntType: - return py::cast(obj); - case TypeKind::NoneType: - return {}; - case TypeKind::ListType: - case TypeKind::TupleType: - throw std::runtime_error("Lists and tuples are not supported yet"); - case TypeKind::NumberType: - throw std::runtime_error("Insufficient type information to convert input"); - } - throw std::runtime_error("Missing cases in castToIValue! 
File a bug report."); - }; +inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) { Stack result; result.reserve(tuple.size() + reserve_extra_space); - for (size_t i = 0; i < inputs.size(); ++i) { - result.push_back(castToIValue(tuple[i], *inputs[i]->type())); + for(auto e : tuple) { + result.push_back(py::cast(e)); } return result; } -inline py::object wrapStack(Stack&& outputs, at::ArrayRef output_vals) { - if (outputs.size() != output_vals.size()) { - throw std::runtime_error("expected " + std::to_string(output_vals.size()) + - " outputs, but got " + std::to_string(outputs.size())); - } - static const auto createOutput = [](IValue && ivalue, Value * value) -> py::object { - switch (value->type()->kind()) { - case TypeKind::DynamicType: - case TypeKind::TensorType: - return py::cast(autograd::Variable(ivalue.toTensor())); - case TypeKind::FloatType: - return py::cast(ivalue.toDouble()); - case TypeKind::IntType: - return py::cast(ivalue.toInt()); - case TypeKind::NoneType: - return py::none(); - case TypeKind::ListType: - case TypeKind::TupleType: - throw std::runtime_error("Lists and tuples are not supported yet"); - case TypeKind::NumberType: - throw std::runtime_error("Insufficient type information to convert input"); - } - throw std::runtime_error("Missing cases in createOutput! File a bug report."); - }; +inline py::object wrapStack(Stack&& outputs) { if (outputs.size() == 0) { return py::none(); } else if (outputs.size() == 1) { - return createOutput(std::move(outputs[0]), output_vals[0]); + JIT_ASSERT(outputs[0].isTensor()); + return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor())); } else { py::tuple tuple(outputs.size()); for(size_t i = 0; i < outputs.size(); i++) { - tuple[i] = createOutput(std::move(outputs[i]), output_vals[i]); + JIT_ASSERT(outputs[i].isTensor()); + tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor())); } return tuple; } diff --git a/torch/csrc/jit/python_arg_flatten.h b/torch/csrc/jit/python_arg_flatten.h index 3e1477e52e0701..b5139032fde169 100644 --- a/torch/csrc/jit/python_arg_flatten.h +++ b/torch/csrc/jit/python_arg_flatten.h @@ -14,7 +14,7 @@ namespace torch { namespace jit { namespace python { struct IODescriptor { struct VariableMetadata { VariableMetadata(const autograd::Variable& var) - : sizes(var.sizes().vec()) + : sizes(var.sizes()) , type(var.type().scalarType()) , device(var.type().is_cuda() ? 
var.get_device() : -1) , requires_grad(var.requires_grad()) {} @@ -104,7 +104,7 @@ struct ParsedArgs { ParsedArgs flatten(py::handle obj); -PyObject* unflatten(at::ArrayRef vars, +PyObject* unflatten(at::ArrayRef outputs, const IODescriptor& structure); }}} // namespace torch::jit::python diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index b72fdb6b8860b1..81211085569953 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -451,22 +451,10 @@ void initPythonIRBindings(PyObject * module_) { .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); }) - .def("__eq__", [](std::shared_ptr& self, std::shared_ptr& other) { - return *self == *other; - }) - .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { - return self->isSubtypeOf(other); - }); + ; - py::class_>(m, "NumberType") - .def_static("get", &NumberType::get); - py::class_>(m, "IntType") - .def_static("get", &IntType::get); - py::class_>(m, "FloatType") - .def_static("get", &FloatType::get); py::class_>(m, "DynamicType") - .def_static("get", &DynamicType::get); - + .def(py::init([](){ return DynamicType::create(); })); py::class_>(m, "TupleType") .def(py::init([](std::vector a){ return TupleType::create(a); })) .def("elements", [](TupleType &self){ @@ -477,9 +465,7 @@ void initPythonIRBindings(PyObject * module_) { return types; }); py::class_>(m, "ListType") - .def_static("ofInts", &ListType::ofInts) - .def_static("ofTensors", &ListType::ofTensors) - .def("getElementType", &ListType::getElementType); + .def_static("ofInts", &ListType::ofInts); py::class_(m,"Use") .def_readonly("user",&Use::user) diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 0496af67412654..7439b2b5e334cc 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -103,10 +103,10 @@ void pythonRecordSourceLocation(Node* n) { n->setSourceLocation(sl); } -void initPythonTracerBindings(PyObject* module) { +void initPythonTracerBindings(PyObject* module_) { setRecordSourceLocation(pythonRecordSourceLocation); - auto m = py::handle(module).cast(); + auto m = py::handle(module_).cast(); py::class_>(m, "TracingState", py::dynamic_attr()) // NB: no constructor; you have to get it from C++ code .def("__repr__", [](const TracingState& s) { diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index f2b8ea18a2be24..8fe747e59900f0 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -231,18 +231,6 @@ RegisterOperators reg({ push(stack, std::move(vals)); return 0; }; - } else if (lt->getElementType()->isSubtypeOf(DynamicType::get())) { - return [=](Stack& stack) { - const size_t stack_size = stack.size(); - std::vector vals; - vals.reserve(num_inputs); - for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { - vals.push_back(std::move(stack[i]).toTensor()); - } - drop(stack, num_inputs); - push(stack, std::move(vals)); - return 0; - }; } else { std::stringstream ss; ss << "unsupported list type: " << *lt->getElementType(); @@ -347,35 +335,7 @@ RegisterOperators reg2({ return 0; }; }), - Operator( - "aten::_tensor_to_list(Tensor a) -> int[]", - [](Node* node) { - return [=](Stack& stack) { - at::Tensor t; - pop(stack, t); - std::vector elems; - for(int i = 0; i < t.size(0); i++){ - elems.push_back(*t[i].toIntData()); - } - push(stack, jit::IntList::create(elems)); - return 0; - }; - }), - Operator( - "aten::_list_to_tensor(int[] a) -> 
Tensor", - [](Node* node) { - return [=](Stack& stack) { - std::vector l; - pop(stack, l); - auto t = torch::empty( - {static_cast(l.size())}, at::dtype(at::kInt)); - for(size_t i = 0; i < l.size(); i++){ - t[i] = l[i]; - } - push(stack, t); - return 0; - }; - }), + // commutative DEFINE_ST_OP(mul, at::mul(b, a)) DEFINE_ST_OP(add, at::add(b, a)) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 4f27cb25b53cb7..0016f69b5ce07b 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -351,19 +351,37 @@ Value* createNumber(Graph& g, const SourceRange& loc, const at::Tensor& val) { return output; } +Value* createStack(Graph& g, const SourceRange& loc, at::ArrayRef inputs) { + // bake in constant propagation for the all-constant case because it is + // common to see constant lists like [1, 2] passed to attributes + bool all_constant = std::all_of(inputs.begin(), inputs.end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + if(all_constant) { + auto values = fmap(inputs, [&](Value* v) { + return v->node()->t(attr::value); + }); + return insertConstant(g, at::stack(values), loc); + } + return g.insertNode(g.create(aten::stack, inputs) + ->i_(attr::dim, 0) + ->setSourceLocation(std::make_shared(loc)))->output(); +} + +static bool isTensorSubtype(Value* v) { + return v->type()->isSubtypeOf(DynamicType::get()); +} + at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as>(input); if(list) - return list.value()->elements().vec(); - + return std::vector(list.value()->elements()); // broadcast IntList[3] with value 4 -> {4, 4, 4} if(!N) return at::nullopt; - auto r = constant_as(input); if(!r) return at::nullopt; - // broadcast to attribute size return std::vector(*N, *r); } @@ -437,46 +455,51 @@ at::optional> tryMatchSchema( } // check input types - std::vector matched_inputs; + std::vector flat_inputs; for(size_t i = 0; i < schema.arguments.size(); ++i) { - Value* value = positional_inputs[i]->value; + NamedValue v = *positional_inputs[i]; const auto& arg = schema.arguments[i]; // some functions that take lists of integers for fixed size arrays // also allow single ints to be passed in their place. 
// the single int is then repeated to the length of the list - if (isIntUsedAsIntList(value, arg)) { - std::vector repeated(*arg.N, value); - value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); + if (isIntUsedAsIntList(v.value, arg)) { + std::vector repeated(*arg.N, v.value); + v.value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } - // Allow homogeneous tuples to be casted implicitly to lists of appropriate types - if (arg.type->kind() == TypeKind::ListType && - value->type()->kind() == TypeKind::TupleType && - value->type()->isSubtypeOf(arg.type)) { - auto unpacked = createTupleUnpack(value); - auto elem_type = arg.type->expect()->getElementType(); - value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); + // Allow tuples that only contain integers to turn into lists of integers + if(*ListType::ofInts() == *arg.type && + v.value->type()->kind() == TypeKind::TupleType && + v.value->type()->isSubtypeOf(ListType::ofInts())) { + auto unpacked = createTupleUnpack(v.value); + v.value = graph.insertNode(graph.createList(IntType::get(), unpacked))->output(); } - if (value->node()->kind() == prim::None){ + if (v.value->node()->kind() == prim::None){ if (arg.type->isSubtypeOf(NumberType::get())) - value = insertConstant(graph, at::Scalar(NAN), loc); + v.value = insertConstant(graph, at::Scalar(NAN), loc); else - value = graph.insertNode(graph.createUndefined())->output(); + v.value = graph.insertNode(graph.createUndefined())->output(); } - if(!value->type()->isSubtypeOf(arg.type)) { + if(!v.value->type()->isSubtypeOf(arg.type)) { err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " - << value->type()->str() << "\n" - << positional_inputs[i]->loc; + << v.value->type()->str() << "\n" + << v.loc; return at::nullopt; } - matched_inputs.push_back(value); + // we only support tensor lists for builtins, where they must be flattened + if(arg.type->isSubtypeOf(ListType::ofTensors())) { + auto outputs = createTupleUnpack(v.value); + flat_inputs.insert(flat_inputs.end(), outputs.begin(), outputs.end()); + } else { + flat_inputs.push_back(v.value); + } } - return matched_inputs; + return flat_inputs; } @@ -490,27 +513,27 @@ static std::shared_ptr tryEmitBuiltin( at::ArrayRef attributes) { auto graph = method.graph(); - auto matched_inputs = tryMatchSchema(op->schema(), loc, *graph, inputs, attributes, failure_messages); - if(!matched_inputs) + auto flat_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); + if(!flat_inputs) return nullptr; // we successfully matched this schema, construct the node NodeKind kind(Symbol::aten(name)); - auto n = graph->insertNode(graph->create(kind, *matched_inputs, 0)) + auto n = graph->insertNode(graph->create(kind, *flat_inputs, 0)) ->setSourceLocation(std::make_shared(loc)); // special case for chunk when the chunks= is known // DO NOT ADD MORE SPECIAL CASES HERE, REFACTOR INTO A FUNCTION IF // NEEDED if(n->kind() == aten::chunk) { - auto value = constant_as((*matched_inputs)[1]); + auto value = constant_as((*flat_inputs)[1]); if(!value) { throw ErrorReport(loc) << "argument 'chunks' must be a constant"; } for(int64_t i = 0; i < *value; ++i) n->addOutput(); } else { - for(auto & ret : op->schema().returns) { + for(auto & ret : op->schema.returns) { n->addOutput()->setType(ret.type); } } @@ -565,7 +588,7 @@ std::shared_ptr emitBuiltinCall( } static Value* ensureTensor(const SourceRange& range, Value* v) { - 
if(!v->type()->isSubtypeOf(DynamicType::get())) { + if(!isTensorSubtype(v)) { throw ErrorReport(range) << "expected a tensor value but found a " << v->type()->str(); } @@ -677,7 +700,7 @@ struct to_ir { if (return_stmt.values().size() == 1 && results.size() == 1) { auto result = results.at(0); if(result->type()->cast()) { - results = createTupleUnpack(result).vec(); + results = createTupleUnpack(result); } } if (typed_def.schema && typed_def.schema->returns.size() != results.size()) { @@ -688,16 +711,12 @@ struct to_ir { auto range = return_stmt.range(); size_t return_type_idx = 0; for (auto& r : results) { - // TODO: support tuples and lists as returns - auto return_kind = r->type()->kind(); - if (return_kind != TypeKind::TensorType && - return_kind != TypeKind::DynamicType && - return_kind != TypeKind::IntType && - return_kind != TypeKind::FloatType) { - throw ErrorReport(return_stmt.range()) << "The only supported return types " - << "are tensors, ints and floats"; + if(r->type()->isSubtypeOf(NumberType::get())) { + graph->registerOutput(numToTensor(range, r)); + } else { + ensureTensor(range, r); + graph->registerOutput(r); } - graph->registerOutput(r); TypePtr type = DynamicType::get(); if (typed_def.schema) { type = typed_def.schema->returns.at(return_type_idx).type; @@ -1368,11 +1387,6 @@ struct to_ir { auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); return graph->insertNode(graph->createTuple(values))->output(); } break; - case TK_TUPLE_LITERAL: { - auto ll = TupleLiteral(tree); - auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); - return graph->insertNode(graph->createTuple(values))->output(); - } break; default: throw ErrorReport(tree) << "NYI: " << tree; break; diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index 3c4dcb07a248ee..0b87cf56be6ad3 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -68,7 +68,7 @@ struct SugaredValue : public std::enable_shared_from_this { SourceRange loc, Method & m, // note: names for args will be 'argument 0', 'argument 1', etc.. 
- at::ArrayRef inputs_, + at::ArrayRef inputs, at::ArrayRef attributes, size_t n_binders) { // n_binders is always set to the number of variables an expression is @@ -89,7 +89,7 @@ struct SugaredValue : public std::enable_shared_from_this { throw ErrorReport(loc) << "cannot call a " << kind(); } - virtual ~SugaredValue() = default; + virtual ~SugaredValue() {} }; // most things in the environment are just simple value types diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 39bb51ed89ca5d..cb7893234dc747 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -370,15 +370,10 @@ static void gatherParametersAndBuffers(std::vector & values, const } } -Stack createStack(const py::tuple& tuple, const Method& method) { - auto relevant_inputs = method.graph()->inputs().slice(0, method.num_inputs()); - return createStack(tuple, relevant_inputs); -} - py::object runMethodFromPython(Method& m, py::args args) { - auto stack = createStack(args, m); + auto stack = createStack(args); m.run(stack); - return wrapStack(std::move(stack), m.graph()->outputs()); + return wrapStack(std::move(stack)); } void initJitScriptBindings(PyObject* module) { @@ -507,8 +502,7 @@ void initJitScriptBindings(PyObject* module) { }) .def("graph_for", [](Module& self, py::args args) { if (self.find_method("forward")) { - Method & m = self.get_method("forward"); - return m.graph_for(createStack(args, m.graph()->inputs())); + return self.get_method("forward").graph_for(createStack(args)); } throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()"); }) @@ -536,7 +530,7 @@ void initJitScriptBindings(PyObject* module) { .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes) .def("params", &Method::params) .def("graph_for", [](Method& self, py::args args) { - return self.graph_for(createStack(args, self.graph()->inputs())); + return self.graph_for(createStack(args)); }) .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) { std::vector arg_type_args, return_type_args; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 1694889d630d39..912b488dde5d9e 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -75,7 +75,6 @@ namespace script { _(TK_GATHER, "gather", "") \ _(TK_NOTHING, "nothing", "") \ _(TK_LIST_LITERAL, "list-literal", "") \ - _(TK_TUPLE_LITERAL, "tuple-literal", "") \ _(TK_FOR, "for", "for") \ _(TK_IN, "in", "in") \ _(TK_STARRED, "starred", "") \ diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 0cd833dc15e488..abea2778053699 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -30,7 +30,7 @@ struct Parser { List(makeList(range, std::move(attributes)))); } // exp | expr, | expr, expr, ... 
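The grammar comment above ("exp | expr, | expr, expr, ...") is what parseExpOrExpList, renamed from parseExpOrExpTuple in this change, implements: parse one expression, and if a comma follows, keep collecting expressions and wrap the result in a single list literal rather than a tuple literal. A toy standalone sketch of that parse shape over a comma-separated string, with a trivial scanner in place of the script lexer (everything below is illustrative):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Toy stand-in for parseExpOrExpList: "exp" or "exp, exp, ..." up to the end
// of input. A single expression stays scalar; a trailing or separating comma
// promotes the whole thing to one list node.
struct Node {
  bool is_list = false;
  std::vector<std::string> elems;
};

Node parseExpOrExpList(std::istringstream& in) {
  Node result;
  std::string tok;
  while (std::getline(in, tok, ',')) {
    tok.erase(0, tok.find_first_not_of(' '));  // trim the toy "expression"
    result.elems.push_back(tok);
  }
  result.is_list = result.elems.size() > 1;
  return result;
}

int main() {
  std::istringstream single("a + b");
  std::istringstream many("a, b, c");
  std::cout << parseExpOrExpList(single).is_list << " "   // 0
            << parseExpOrExpList(many).is_list << "\n";   // 1
}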
- TreeRef parseExpOrExpTuple(int end) { + TreeRef parseExpOrExpList(int end) { auto prefix = parseExp(); if(L.cur().kind == ',') { std::vector exprs = { prefix }; @@ -39,7 +39,7 @@ struct Parser { exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); - prefix = TupleLiteral::create(list.range(), list); + prefix = ListLiteral::create(list.range(), list); } return prefix; } @@ -61,14 +61,7 @@ struct Parser { } break; case '(': { L.next(); - if (L.nextIf(')')) { - /// here we have the empty tuple case - std::vector vecExpr; - List listExpr = List::create(L.cur().range, vecExpr); - prefix = TupleLiteral::create(L.cur().range, listExpr); - break; - } - prefix = parseExpOrExpTuple(')'); + prefix = parseExpOrExpList(')'); L.expect(')'); } break; case '[': { @@ -249,7 +242,7 @@ struct Parser { // first[,other,lhs] = rhs Assign parseAssign(List list) { auto red = parseOptionalReduction(); - auto rhs = parseExpOrExpTuple(TK_NEWLINE); + auto rhs = parseExpOrExpList(TK_NEWLINE); L.expect(TK_NEWLINE); return Assign::create(list.range(), list, AssignKind(red), Expr(rhs)); } diff --git a/torch/csrc/jit/script/python_tree_views.cpp b/torch/csrc/jit/script/python_tree_views.cpp index 7ece5e055a33df..569d1b0e66fdf3 100644 --- a/torch/csrc/jit/script/python_tree_views.cpp +++ b/torch/csrc/jit/script/python_tree_views.cpp @@ -193,10 +193,6 @@ void initTreeViewBindings(PyObject *module) { .def(py::init([](const SourceRange& range, std::vector args) { return ListLiteral::create(range, wrap_list(range, std::move(args))); })); - py::class_(m, "TupleLiteral") - .def(py::init([](const SourceRange& range, std::vector args) { - return TupleLiteral::create(range, wrap_list(range, std::move(args))); - })); py::class_(m, "Gather") .def(py::init([](const Expr& base, const Expr& index) { return Gather::create(base.range(), base, index); diff --git a/torch/csrc/jit/script/tree.h b/torch/csrc/jit/script/tree.h index 0b9bc7009e0162..e3d69d2790682d 100644 --- a/torch/csrc/jit/script/tree.h +++ b/torch/csrc/jit/script/tree.h @@ -89,7 +89,7 @@ struct Tree : std::enable_shared_from_this { throw std::runtime_error(ss.str()); } } - virtual ~Tree() = default; + virtual ~Tree() {} private: int kind_; diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index 10ac01799c0607..6cc934ab4d177a 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -58,7 +58,6 @@ namespace script { // | Gather(Expr value, Expr indices) TK_GATHER // | Var(Ident name) TK_VAR // | ListLiteral(List inputs) TK_LIST_LITERAL -// | TupleLiteral(List inputs) TK_TUPLE_LITERAL // | Starred(Expr expr) TK_STARRED // // -- NB: only allowed expressions are Const or List(Const) @@ -256,7 +255,6 @@ struct Expr : public TreeView { case TK_GATHER: case TK_VAR: case TK_LIST_LITERAL: - case TK_TUPLE_LITERAL: case '@': case TK_POW: return; @@ -696,17 +694,6 @@ struct ListLiteral : public Expr { } }; -struct TupleLiteral : public Expr { - explicit TupleLiteral(const TreeRef& tree) : Expr(tree) { - tree_->match(TK_TUPLE_LITERAL); - } - List inputs() const { - return subtree(0); - } - static TupleLiteral create(const SourceRange& range, const List& inputs) { - return TupleLiteral(Compound::create(TK_TUPLE_LITERAL, range, {inputs})); - } -}; struct Starred : public Expr { explicit Starred(const TreeRef& tree) : Expr(tree) { diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 7a23aa55df538f..2c74ae7e0a4c77 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ 
-77,8 +77,8 @@ inline void pack(Stack & stack, T&& v) { } template<> -inline void pack(Stack & stack, std::vector&& v) { - for(auto& t : v) { +inline void pack(Stack & stack, std::vector&& ts) { + for(auto& t : ts) { stack.push_back(IValue(std::move(t))); } } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index ef6d41005789f8..e4d2f98ba0ea0f 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -119,20 +119,18 @@ struct SymbolicVariable { return create(t("narrow"), { *this, insertConstant(dim), insertConstant(start), insertConstant(length) }, 1)[0]; } static SymbolicVariable cat(ArrayRef inputs, Value* dim) { - Graph *g = dim->owningGraph(); - auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); - Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); - return create(aten::cat, {input_list, dim})[0]; + std::vector all_inputs = inputs; + all_inputs.push_back(dim); + return create(aten::cat, all_inputs)[0]; } static SymbolicVariable cat(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); return SymbolicVariable::cat(inputs, inputs[0].insertConstant(dim)); } static SymbolicVariable stack(ArrayRef inputs, Value* dim) { - Graph *g = dim->owningGraph(); - auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); - Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); - return create(aten::stack, {input_list, dim})[0]; + std::vector all_inputs = inputs; + all_inputs.push_back(dim); + return create(aten::stack, all_inputs)[0]; } static SymbolicVariable stack(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index d5d204f9465bd8..8c9763f88353e5 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -220,9 +220,6 @@ static void fusionTests() { testOne(1,2,0,2); - auto createFusedConcat = [](Graph & graph, at::ArrayRef inputs, int64_t dim) -> Value* { - return graph.insertNode(graph.create(prim::FusedConcat, inputs)->i_(attr::dim, dim))->output(); - }; auto testConcat = [&](int dim) { Graph graph; @@ -230,7 +227,7 @@ static void fusionTests() { Var i1 = Var::asNewInput(graph); auto o0 = i0 * i1; o0.addAsOutput(); - Var(createFusedConcat(graph, {i0, o0}, dim)).addAsOutput(); + Var::cat({i0, o0}, dim).addAsOutput(); auto a = at::rand({3,4,5}, at::kCUDA); auto b = at::rand({4,3,5}, at::kCUDA).transpose(0,1); @@ -779,9 +776,6 @@ void argumentSpecTest() { REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); - Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) }; - ArgumentSpec with_const(true, stack); - REQUIRE(with_const.at(2).sizes().size() == 2); } void shapeAnalysisTest() { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index a0e2f65e617754..aec6eb4ddc9447 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -38,9 +38,9 @@ void addInputs(Node *n, const char * name, const std::string& value) { b void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); } void addInputs(Node *n, const char * name, at::TensorList value) { - Graph *g = n->owningGraph(); - Node *list_node = g->appendNode(g->createList(DynamicType::get(), fmap(value, getValueTrace))); - n->addInput(list_node->output()); + for (auto & t : value) { + n->addInput(getValueTrace(t)); + } } void addInputs(Node *n, const char * name, 
at::IntList value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index ddb4dfad0154ad..ebcc91a908c213 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -46,31 +46,31 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { return out; } -DynamicTypePtr DynamicType::get() { +TypePtr DynamicType::get() { static auto value = DynamicType::create(); return value; } -NumberTypePtr NumberType::get() { +TypePtr NumberType::get() { static auto value = NumberType::create(); return value; } -IntTypePtr IntType::get() { +TypePtr IntType::get() { static auto value = IntType::create(); return value; } -FloatTypePtr FloatType::get() { +TypePtr FloatType::get() { static auto value = FloatType::create(); return value; } -NoneTypePtr NoneType::get() { +TypePtr NoneType::get() { static auto value = NoneType::create(); return value; } -ListTypePtr ListType::ofTensors() { +TypePtr ListType::ofTensors() { static auto value = ListType::create(DynamicType::get()); return value; } -ListTypePtr ListType::ofInts() { +TypePtr ListType::ofInts() { static auto value = ListType::create(IntType::get()); return value; } diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 713718e40681c8..7b7d708a549b32 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -80,7 +80,7 @@ struct TORCH_API Type : std::enable_shared_from_this { JIT_ASSERT(T::Kind == kind()); return std::static_pointer_cast(shared_from_this()); } - virtual ~Type() = default; + virtual ~Type() {} }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -104,7 +104,7 @@ struct TORCH_API DynamicType : public Type { } static const TypeKind Kind = TypeKind::DynamicType; // global singleton - static DynamicTypePtr get(); + static TypePtr get(); private: DynamicType() : Type(TypeKind::DynamicType) {} @@ -186,16 +186,16 @@ struct TORCH_API TensorType : public Type { : Type(TypeKind::TensorType) , scalar_type_(tensor.type().scalarType()) , device_(tensor.type().is_cuda() ? 
tensor.get_device() : -1) - , sizes_(tensor.sizes().vec()) - , strides_(tensor.strides().vec()) {} + , sizes_(tensor.sizes()) + , strides_(tensor.strides()) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes) : TensorType(scalar_type, device, sizes, TensorType::contiguousStridesOf(sizes)) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides) : Type(TypeKind::TensorType) , scalar_type_(scalar_type) , device_(device) - , sizes_(sizes.vec()) - , strides_(strides.vec()) + , sizes_(sizes) + , strides_(strides) {} static std::vector contiguousStridesOf(at::IntList sizes) { std::vector strides(sizes.size()); @@ -237,8 +237,8 @@ struct TORCH_API ListType : public Type { return elem; } // common cast List[Tensor] - static ListTypePtr ofTensors(); - static ListTypePtr ofInts(); + static TypePtr ofTensors(); + static TypePtr ofInts(); private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(elem) {} @@ -326,7 +326,7 @@ struct TORCH_API NumberType : public Type { } static const TypeKind Kind = TypeKind::NumberType; // global singleton - static NumberTypePtr get(); + static TypePtr get(); private: NumberType() : Type(TypeKind::NumberType) {} @@ -351,7 +351,7 @@ struct TORCH_API FloatType : public Type { } static const TypeKind Kind = TypeKind::FloatType; // global singleton - static FloatTypePtr get(); + static TypePtr get(); private: FloatType() : Type(TypeKind::FloatType) {} @@ -376,7 +376,7 @@ struct TORCH_API IntType : public Type { } static const TypeKind Kind = TypeKind::IntType; // global singleton - static IntTypePtr get(); + static TypePtr get(); private: IntType() : Type(TypeKind::IntType) {} @@ -401,7 +401,7 @@ struct NoneType : public Type { } static const TypeKind Kind = TypeKind::NoneType; // global singleton - static NoneTypePtr get(); + static TypePtr get(); private: NoneType() : Type(TypeKind::NoneType) {} diff --git a/torch/csrc/jit/variable_tensor_list.h b/torch/csrc/jit/variable_tensor_list.h index 0916fe6ac051d2..eeae2a66b17e5f 100644 --- a/torch/csrc/jit/variable_tensor_list.h +++ b/torch/csrc/jit/variable_tensor_list.h @@ -6,10 +6,10 @@ namespace torch { namespace jit { // a wrapper to mark places where we expect all the at::Tensors to be // variables struct variable_tensor_list : public std::vector { - variable_tensor_list() = default; + variable_tensor_list() {} template variable_tensor_list(InputIt first, InputIt last) - : std::vector(first, last) {} + : std::vector(first, last) {} explicit variable_tensor_list(std::vector && tensor) : std::vector(std::move(tensor)) {} }; diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index 64747c8c4b83a9..b09824ec77b4a5 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -1,33 +1,36 @@ #include "torch/csrc/onnx/init.h" +#include "torch/csrc/onnx/onnx.npb.h" #include "torch/csrc/onnx/onnx.h" -#include "onnx/onnx.pb.h" namespace torch { namespace onnx { void initONNXBindings(PyObject* module) { auto m = py::handle(module).cast(); auto onnx = m.def_submodule("_onnx"); - py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType") - .value("UNDEFINED", ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) - .value("FLOAT", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT) - .value("UINT8", ::ONNX_NAMESPACE::TensorProto_DataType_UINT8) - .value("INT8", ::ONNX_NAMESPACE::TensorProto_DataType_INT8) - .value("UINT16", ::ONNX_NAMESPACE::TensorProto_DataType_UINT16) - .value("INT16", ::ONNX_NAMESPACE::TensorProto_DataType_INT16) - 
.value("INT32", ::ONNX_NAMESPACE::TensorProto_DataType_INT32) - .value("INT64", ::ONNX_NAMESPACE::TensorProto_DataType_INT64) - .value("STRING", ::ONNX_NAMESPACE::TensorProto_DataType_STRING) - .value("BOOL", ::ONNX_NAMESPACE::TensorProto_DataType_BOOL) - .value("FLOAT16", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) - .value("DOUBLE", ::ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) - .value("UINT32", ::ONNX_NAMESPACE::TensorProto_DataType_UINT32) - .value("UINT64", ::ONNX_NAMESPACE::TensorProto_DataType_UINT64) - .value("COMPLEX64", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64) - .value("COMPLEX128", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128); + py::enum_(onnx, "TensorProtoDataType") + .value("UNDEFINED", onnx_TensorProto_DataType_UNDEFINED) + .value("FLOAT", onnx_TensorProto_DataType_FLOAT) + .value("UINT8", onnx_TensorProto_DataType_UINT8) + .value("INT8", onnx_TensorProto_DataType_INT8) + .value("UINT16", onnx_TensorProto_DataType_UINT16) + .value("INT16", onnx_TensorProto_DataType_INT16) + .value("INT32", onnx_TensorProto_DataType_INT32) + .value("INT64", onnx_TensorProto_DataType_INT64) + .value("STRING", onnx_TensorProto_DataType_STRING) + .value("BOOL", onnx_TensorProto_DataType_BOOL) + .value("FLOAT16", onnx_TensorProto_DataType_FLOAT16) + .value("DOUBLE", onnx_TensorProto_DataType_DOUBLE) + .value("UINT32", onnx_TensorProto_DataType_UINT32) + .value("UINT64", onnx_TensorProto_DataType_UINT64) + .value("COMPLEX64", onnx_TensorProto_DataType_COMPLEX64) + .value("COMPLEX128", onnx_TensorProto_DataType_COMPLEX128); py::enum_(onnx, "OperatorExportTypes") .value("ONNX", OperatorExportTypes::ONNX) .value("ONNX_ATEN", OperatorExportTypes::ONNX_ATEN) .value("ONNX_ATEN_FALLBACK", OperatorExportTypes::ONNX_ATEN_FALLBACK) .value("RAW", OperatorExportTypes::RAW); + + py::class_(onnx, "ModelProto") + .def("prettyPrint", &ModelProto::prettyPrint); } }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.cpp b/torch/csrc/onnx/onnx.cpp new file mode 100644 index 00000000000000..fa93f6866d5ed6 --- /dev/null +++ b/torch/csrc/onnx/onnx.cpp @@ -0,0 +1,214 @@ +#include "torch/csrc/onnx/onnx.h" + +namespace torch { namespace onnx { + +template <> +bool micropb_encode(pb_ostream_t *stream, std::string* arg) { + return pb_encode_string(stream, reinterpret_cast(arg->c_str()), arg->size()); +} +// NB: Overloads don't work so great for signed variables. Hope this doesn't +// come up! +template <> +bool micropb_encode(pb_ostream_t *stream, int64_t* arg) { + // Yes, this looks dodgy, and yes, this is what the docs say to do: + // https://jpa.kapsi.fi/nanopb/docs/reference.html#pb-encode-varint + return pb_encode_varint(stream, *reinterpret_cast(arg)); +} +template <> +bool micropb_encode(pb_ostream_t *stream, float* arg) { + return pb_encode_fixed32(stream, static_cast(arg)); +} +template <> +bool micropb_encode(pb_ostream_t *stream, double* arg) { + return pb_encode_fixed64(stream, static_cast(arg)); +} + +template <> +bool micropb_encode(pb_ostream_t *stream, Dimension* arg) { + return pb_encode_submessage(stream, onnx_TensorShapeProto_Dimension_fields, + static_cast(arg)); +} + +// TODO: I'm not entirely sure why this can't be in the header... +bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { + at::Tensor* t = static_cast(*arg); + AT_ASSERT(t->is_contiguous()); + // Packed array format! 
+ pb_encode_tag_for_field(stream, field); + pb_encode_string(stream, (pb_byte_t*)(t->data_ptr()), t->type().elementSizeInBytes()*t->numel()); + + return true; +} + +GraphProto* AttributeProto::add_graphs() { + auto ptr = new GraphProto(); + graphs.emplace_back(ptr); + return ptr; +} + +constexpr char indent_char = ' '; +constexpr size_t indent_multiplier = 2; + +std::string idt(size_t indent) { + return std::string(indent * indent_multiplier, indent_char); +} + +std::string nlidt(size_t indent) { + return std::string("\n") + idt(indent); +} + +void TensorProto::dump(std::ostream& stream, size_t indent) { + stream << "TensorProto shape: ["; + for (size_t i = 0; i < dims.size(); ++i) { + stream << *dims[i] << (i == dims.size() - 1 ? "" : " "); + } + stream << "]"; +} + +void TensorShapeProto::dump(std::ostream& stream, size_t indent) { + for (size_t i=0; i < dims.size(); ++i) { + auto &dim = dims[i]; + if (dim->has_dim_value) { + stream << dim->dim_value; + } else { + stream << "?"; + } + stream << (i == dims.size() - 1 ? "" : " "); + } +} + +void TypeProtoTensor::dump(std::ostream& stream, size_t indent) { + stream << "Tensor dims: "; + shape->dump(stream); +} + +void TypeProto::dump(std::ostream& stream, size_t indent) { + tensor_type->dump(stream); +} + +void ValueInfoProto::dump(std::ostream& stream, size_t indent) { + stream << "{name: \"" << name + << "\", type:"; + type->dump(stream); + stream << "}"; +} + +void AttributeProto::dump(std::ostream& stream, size_t indent) { + stream << "{ name: '" << name << "', type: "; + if (proto.has_f) { + stream << "float, value: " << proto.f; + } else if (proto.has_i) { + stream << "int, value: " << proto.i; + } else if (s.length()) { + stream << "string, value: '" << s << "'"; + } else if (g) { + stream << "graph, value:\n"; + g->dump(stream, indent+1); + stream << nlidt(indent); + } else if (t) { + stream << "tensor, value:"; + t->dump(stream, indent+1); + } else if (floats.size()) { + stream << "floats, values: ["; + for (size_t i=0; i < floats.size(); ++i) + stream << *floats[i] << (i == floats.size() - 1 ? "" : " "); + stream << "]"; + } else if (ints.size()) { + stream << "ints, values: ["; + for (size_t i=0; i < ints.size(); ++i) + stream << *ints[i] << (i == ints.size() - 1 ? "" : " "); + stream << "]"; + } else if (strings.size()) { + stream << "strings, values: ["; + for (size_t i=0; i < strings.size(); ++i) + stream << "'" << *strings[i] << "'" << (i == strings.size() - 1 ? "" : " "); + stream << "]"; + } else if (tensors.size()) { + stream << "tensors, values: ["; + for (auto& t : tensors) { + t->dump(stream, indent+1); + } + stream << "]"; + } else if (graphs.size()) { + stream << "graphs, values: ["; + for (auto& g : graphs) { + g->dump(stream, indent+1); + } + stream << "]"; + } else { + stream << "UNKNOWN"; + } + stream << "}"; +} + +void NodeProto::dump(std::ostream& stream, size_t indent) { + stream << "Node {type: \"" << op_type << "\", inputs: ["; + for (size_t i=0; i < inputs.size(); ++i) { + stream << *inputs[i] << (i == inputs.size() - 1 ? "" : ","); + } + stream << "], outputs: ["; + for (size_t i=0; i < outputs.size(); ++i) { + stream << *outputs[i] << (i == outputs.size() - 1 ? "" : ","); + } + stream << "], attributes: ["; + for (size_t i=0; i < attributes.size(); ++i) { + attributes[i]->dump(stream, indent+1); + stream << (i == attributes.size() - 1 ? 
"" : ","); + } + stream << "]}"; +} + +void GraphProto::dump(std::ostream& stream, size_t indent) { + stream << idt(indent) << "GraphProto {" << nlidt(indent+1) + << "name: \"" << name << "\"" << nlidt(indent+1) + << "inputs: ["; + for (size_t i=0; i < inputs.size(); ++i) { + inputs[i]->dump(stream, indent+2); + stream << (i == inputs.size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "outputs: ["; + for (size_t i=0; i < outputs.size(); ++i) { + outputs[i]->dump(stream, indent+2); + stream << (i == outputs.size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "initializers: ["; + for (size_t i=0; i < initializers.size(); ++i) { + initializers[i]->dump(stream, indent+2); + stream << (i == initializers.size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "nodes: [" << nlidt(indent+2); + for (size_t i=0; i < nodes.size(); ++i) { + nodes[i]->dump(stream, indent+2); + if (i != nodes.size() - 1) stream << "," << nlidt(indent+2); + } + stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; +} + +void OperatorSetIdProto::dump(std::ostream& stream, size_t indent) { + stream << "OperatorSetIdProto { domain: " << domain << "}"; +} + +void ModelProto::dump(std::ostream& stream, size_t indent) { + stream << idt(indent) + << "ModelProto {" << nlidt(indent+1) + << "producer_name: \"" << producer_name << "\"" << nlidt(indent+1) + << "domain: \"" << domain << "\"" << nlidt(indent+1) + << "doc_string: \"" << doc_string << "\""; + if (graph) { + stream << nlidt(indent+1) << "graph:\n"; + graph->dump(stream, indent+2); + } + if (opset_import.size()) { + stream << idt(indent+1) << "opset_import: ["; + for (auto &opset_imp : opset_import) { + opset_imp->dump(stream, indent+2); + } + stream << "],\n"; + } + stream << idt(indent) << "}\n"; +} + +}} // namespace onnx diff --git a/torch/csrc/onnx/onnx.h b/torch/csrc/onnx/onnx.h index 76170e18110f1b..7fa38cc03898e9 100644 --- a/torch/csrc/onnx/onnx.h +++ b/torch/csrc/onnx/onnx.h @@ -1,11 +1,435 @@ #pragma once +#include "torch/csrc/onnx/onnx.npb.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include +#include + +#include +#include +#include + namespace torch { namespace onnx { +using DataType = onnx_TensorProto_DataType; +using Dimension = onnx_TensorShapeProto_Dimension; + +// Note [Unique vector] +// ~~~~~~~~~~~~~~~~~~~~ +// Why do we need vectors of unique pointers? A Google-style C++ Protobuf API +// returns raw pointers T* which are expected to stay valid as long as the +// enclosing protobuf is live. However, if we store T directly in a vector, if +// the vector ever resizes (which it may, because we don't know a priori how +// many elements are in the vector) all of these pointers will be invalidated. +// Thus, up-front, we have to give them permanent, dynamically allocated +// addresses. 
+template +using unique_vector = std::vector>; + +// Helper function for encoding inside callbacks +template +bool micropb_encode(pb_ostream_t *stream, T* arg) { + static_assert(Field != nullptr, "no overload in micropb_encode"); + return pb_encode_submessage(stream, Field, static_cast(&arg->proto)); +} +template <> bool micropb_encode(pb_ostream_t *stream, std::string* arg); +template <> bool micropb_encode(pb_ostream_t *stream, int64_t* arg); +template <> bool micropb_encode(pb_ostream_t *stream, float* arg); +template <> bool micropb_encode(pb_ostream_t *stream, double* arg); +template <> bool micropb_encode(pb_ostream_t *stream, Dimension* arg); +// NB: If we ever add support for signed protobuf integers, we'll need a special +// wrapper, since we can't overload over them (they look the same from C++ side) + +// Callback functions of type pb_callback_t. + +// Write out a single protobuf field inside a message +template +bool micropb_callback(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { + if (!pb_encode_tag_for_field(stream, field)) return false; + if (!micropb_encode(stream, static_cast(*arg))) return false; + return true; +} + +// Write out a repeated protobuf field inside a message +template +bool micropb_callback_list(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { + std::vector>* vals = static_cast>*>(*arg); + for (std::unique_ptr& val : *vals) { + auto ptr = static_cast(val.get()); + if (!micropb_callback(stream, field, &ptr)) return false; + } + return true; +} + +bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg); + +// MicroProto helper class +template +struct MicroProto { + // The actual nanopb generated protobuf struct we are filling. + T proto; + + // The constructor takes the protobuf struct by value for initialization + // (since it is a C-style struct). In the constructor you're + // expected to call this with something like onnx_TensorProto_init_default + MicroProto(T proto) : proto(proto) {} + + // Usage: + // std::string owning_slot; + // proto.string_field = string(&owning_slot, value_to_set) + // + // This function takes a string 's' and copies it into the + // owning slot specified by 'slot'. It then returns a callback + // intended to be assigned into the particular protobuf field. + // The employed callback reads out the string from owning + // slot and writes it out to the protobuf. + // + // You should call this function IN THE SETTER METHOD, because + // the no-op callback is different from a callback with an empty + // string: in the former case, the field is absent; in the latter, + // the field is present but an empty string. + pb_callback_t string(std::string* slot, const std::string& s) { + *slot = s; // copy construct + pb_callback_t r; + r.funcs.encode = µpb_callback; + r.arg = static_cast(slot); + return r; // RVO + } + + // Usage: + // at::Tensor owning_slot; + // proto.string_field = string_from_tensor(&owning_slot, value_to_set) + // + // This function takes an at::Tensor and copies it into the + // owning slot specified by 'slot'. It then returns a callback + // intended to be assigned into the particular protobuf field. + // The employed callback reads out the tensor's data as if it + // were a string (adjusting for endianness, if necessary) + // writes it out to the protobuf. + // + // You should call this function IN THE SETTER METHOD, because + // the no-op callback is different from a callback with an undefined + // Tensor. 
+ pb_callback_t string_from_tensor(at::Tensor* slot, const at::Tensor& t) { + *slot = t; // copy construct + pb_callback_t r; + r.funcs.encode = µpb_callback_string_from_tensor; + r.arg = static_cast(slot); + return r; // RVO + } + + // Usage: + // unique_vector owning_slot; + // proto.list_field = list(&owning_slot) + // + // This function returns a callback intended to be + // assigned into a particular protobuf field. The employed + // callback reads out the vector of elements from the owning + // slot and writes the entries into the protobuf. + // + // You should call this function IN THE CONSTRUCTOR, because + // the no-op callback is equivalent to a callback with an empty + // list. (While it's harmless to call this in the setter, but + // a bit wasteful.) + template + pb_callback_t list(unique_vector* slot) { + pb_callback_t r; + r.funcs.encode = µpb_callback_list; + r.arg = static_cast(slot); + return r; // RVO + } + + template + pb_callback_t msg(std::unique_ptr* slot) { + *slot = std::unique_ptr(new S()); // default construct + pb_callback_t r; + r.funcs.encode = µpb_callback; + r.arg = static_cast(slot->get()); + return r; // RVO + } +}; + +#define DEFINE_CONST(C) \ +const auto k##C = onnx_TensorProto_DataType_##C; +DEFINE_CONST(FLOAT) +DEFINE_CONST(UINT8) +DEFINE_CONST(INT8) +DEFINE_CONST(UINT16) +DEFINE_CONST(INT16) +DEFINE_CONST(INT32) +DEFINE_CONST(INT64) +DEFINE_CONST(STRING) +DEFINE_CONST(BOOL) +DEFINE_CONST(FLOAT16) +DEFINE_CONST(DOUBLE) +DEFINE_CONST(UINT32) +DEFINE_CONST(UINT64) +DEFINE_CONST(COMPLEX64) +DEFINE_CONST(COMPLEX128) +#undef DEFINE_CONST + +#define DEFINE_CONST(C) \ +const auto a##C = onnx_AttributeProto_AttributeType_##C; +DEFINE_CONST(FLOAT) +DEFINE_CONST(INT) +DEFINE_CONST(STRING) +DEFINE_CONST(TENSOR) +DEFINE_CONST(GRAPH) +DEFINE_CONST(FLOATS) +DEFINE_CONST(INTS) +DEFINE_CONST(STRINGS) +DEFINE_CONST(TENSORS) +DEFINE_CONST(GRAPHS) +#undef DEFINE_CONST + +// C++ wrappers which simulate the Google C++ Protobuf API +// +// These are NOT COMPLETE wrappers. If you find something is missing, add it! + +class AttributeProto; +class TensorShapeProto; +class TypeProtoTensor; +class TensorProto; +class TypeProto; +class ValueInfoProto; +class NodeProto; +class GraphProto; +class ModelProto; + +class TensorProto : public MicroProto { +private: + std::string name; // namespace ValueInfoProto. + unique_vector dims; + at::Tensor raw_data; + std::string dump_; +public: + TensorProto() : MicroProto(onnx_TensorProto_init_default) { + proto.dims = list(&dims); + } + void set_name(const std::string& s) { proto.name = string(&name, s); } + void add_dims(int64_t d) { dims.emplace_back(new int64_t(d)); } + // Google Protobuf divergence! 
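+ // (Divergence: the Google-generated C++ API's set_raw_data() takes the bytes
+ // as a std::string, whereas here it takes the at::Tensor directly and the
+ // bytes are only read out of the tensor when the message is encoded.)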
+ void set_raw_data(const at::Tensor& t) { proto.raw_data = string_from_tensor(&raw_data, t); } + void set_external_data_present() { proto.raw_data = string(&dump_, "__EXTERNAL"); } + void set_data_type(onnx_TensorProto_DataType t) { proto.has_data_type = true; proto.data_type = t; } + std::string get_name() const { return name; } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class TensorShapeProto : public MicroProto { +private: + unique_vector dims; +public: + TensorShapeProto() : MicroProto(onnx_TensorShapeProto_init_default) { + proto.dim = list(&dims); + } + void add_dim(std::int64_t d) { + Dimension* p_d = new Dimension(); + p_d->has_dim_value = true; + p_d->dim_value = d; + dims.emplace_back(p_d); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class TypeProtoTensor : public MicroProto { +private: + std::unique_ptr shape; +public: + TypeProtoTensor() : MicroProto(onnx_TypeProto_Tensor_init_default) {} + void set_data_type(onnx_TensorProto_DataType t) { proto.has_elem_type = true; proto.elem_type = t; } + TensorShapeProto* mutable_shape() { + proto.shape = msg(&shape); + return shape.get(); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class TypeProto : public MicroProto { +private: + std::unique_ptr tensor_type; +public: + TypeProto() : MicroProto(onnx_TypeProto_init_default) {} + TypeProtoTensor* mutable_tensor_type() { + proto.tensor_type = msg(&tensor_type); + return tensor_type.get(); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class ValueInfoProto : public MicroProto { +private: + std::string name; + std::unique_ptr type; +public: + ValueInfoProto() : MicroProto(onnx_ValueInfoProto_init_default) {} + std::string get_name() { return name; } + void set_name(const std::string& s) { proto.name = string(&name, s); } + TypeProto* mutable_type() { + proto.type = msg(&type); + return type.get(); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class AttributeProto : public MicroProto { +private: + std::string name; + std::string s; + std::unique_ptr g; + std::unique_ptr t; + unique_vector floats; + unique_vector ints; + unique_vector strings; + unique_vector tensors; + unique_vector graphs; +public: + AttributeProto() : MicroProto(onnx_AttributeProto_init_default) { + proto.floats = list(&floats); + proto.ints = list(&ints); + proto.strings = list(&strings); + proto.tensors = list(&tensors); + proto.graphs = list(&graphs); + } + void set_name(const std::string& s) { proto.name = string(&name, s); } + void set_type(onnx_AttributeProto_AttributeType t) { proto.has_type = true; proto.type = t; } + void set_f(float f) { proto.has_f = true; proto.f = f; } + void set_i(int64_t i) { proto.has_i = true; proto.i = i; } + void set_s(std::string s_) { proto.s = string(&s, s_); } + // See https://developers.google.com/protocol-buffers/docs/reference/cpp-generated#embeddedmessage + GraphProto* mutable_g() { proto.g = msg(&g); return g.get(); } + TensorProto* mutable_t() { proto.t = msg(&t); return t.get(); } + void add_floats(float f) { floats.emplace_back(new float(f)); } + void add_ints(int64_t i) { ints.emplace_back(new int64_t(i)); } + void add_strings(std::string s) { strings.emplace_back(new std::string(s)); } + TensorProto* add_tensors() { + auto ptr = new TensorProto(); + tensors.emplace_back(ptr); + return ptr; + } + GraphProto* add_graphs(); + void dump(std::ostream& stream, size_t indent = 0); +}; + +class NodeProto : public MicroProto { +private: + std::string op_type; + std::string domain; + 
std::string doc_string; + unique_vector inputs; + unique_vector outputs; + unique_vector attributes; +public: + NodeProto() : MicroProto(onnx_NodeProto_init_default) { + proto.input = list(&inputs); + proto.output = list(&outputs); + proto.attribute = list(&attributes); + } + void add_input(const std::string& s) { inputs.emplace_back(new std::string(s)); } + void clear_input() { inputs.clear(); } + void add_output(const std::string& s) { outputs.emplace_back(new std::string(s)); } + void clear_output() { outputs.clear(); } + AttributeProto* add_attribute() { + auto ptr = new AttributeProto(); + attributes.emplace_back(ptr); + return ptr; + } + void set_op_type(const std::string& s) { proto.op_type = string(&op_type, s); } + void set_domain(const std::string& s) { proto.domain = string(&domain, s); } + void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class GraphProto : public MicroProto { +private: + std::string name; + unique_vector inputs; + unique_vector outputs; + unique_vector nodes; + unique_vector initializers; +public: + GraphProto() : MicroProto(onnx_GraphProto_init_default) { + proto.input = list(&inputs); + proto.output = list(&outputs); + proto.node = list(&nodes); + proto.initializer = list(&initializers); + } + void set_name(const std::string& s) { proto.name = string(&name, s); } + ValueInfoProto* add_input() { + auto ptr = new ValueInfoProto(); + inputs.emplace_back(ptr); + return ptr; + } + std::string get_input_name(size_t i) { return inputs.at(i)->get_name(); } + ValueInfoProto* add_output() { + auto ptr = new ValueInfoProto(); + outputs.emplace_back(ptr); + return ptr; + } + NodeProto* add_node() { + auto ptr = new NodeProto(); + nodes.emplace_back(ptr); + return ptr; + } + TensorProto* add_initializer() { + auto ptr = new TensorProto(); + initializers.emplace_back(ptr); + return ptr; + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class OperatorSetIdProto : public MicroProto { +private: + std::string domain; +public: + OperatorSetIdProto() : MicroProto(onnx_OperatorSetIdProto_init_default) {} + void set_domain(const std::string& s) { proto.domain = string(&domain, s); } + void set_version(int64_t v) { proto.has_version = true; proto.version = v; } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class ModelProto : public MicroProto { +private: + std::string producer_name; + std::string producer_version; + std::string domain; + std::string doc_string; + std::unique_ptr graph; + unique_vector opset_import; +public: + ModelProto() : MicroProto(onnx_ModelProto_init_default) { + proto.has_ir_version = true; + proto.ir_version = onnx_Version_IR_VERSION; + proto.opset_import = list(&opset_import); + } + void set_model_version(int64_t i) { proto.has_model_version = true; proto.model_version = i; } + void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } + void set_producer_name(const std::string& s) { proto.producer_name = string(&producer_name, s); } + void set_producer_version(const std::string& s) { proto.producer_version = string(&producer_version, s); } + GraphProto* mutable_graph() { + proto.graph = msg(&graph); + return graph.get(); + } + OperatorSetIdProto* add_opset_import() { + auto ptr = new OperatorSetIdProto(); + opset_import.emplace_back(ptr); + return ptr; + } + TORCH_API void dump(std::ostream& stream, size_t indent = 0); + std::string prettyPrint() { + std::stringstream ss; + dump(ss, 0); + return 
ss.str(); + } +}; + enum class OperatorExportTypes { ONNX, // Strict ONNX export ONNX_ATEN, // ONNX With ATen op everywhere ONNX_ATEN_FALLBACK, // ONNX export with ATen fallback RAW, // Raw export (no ONNX) }; + }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.npb.cpp b/torch/csrc/onnx/onnx.npb.cpp new file mode 100644 index 00000000000000..2d8ee60eaff414 --- /dev/null +++ b/torch/csrc/onnx/onnx.npb.cpp @@ -0,0 +1,162 @@ +/* Automatically generated nanopb constant definitions */ +/* Generated by nanopb-0.3.9-dev */ + +#include "onnx.npb.h" + +/* @@protoc_insertion_point(includes) */ +#if PB_PROTO_HEADER_VERSION != 30 +#error Regenerate this file with the current version of nanopb generator. +#endif + + + +const pb_field_t onnx_AttributeProto_fields[14] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_AttributeProto, name, name, 0), + PB_FIELD( 2, FLOAT , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, f, name, 0), + PB_FIELD( 3, INT64 , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, i, f, 0), + PB_FIELD( 4, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, s, i, 0), + PB_FIELD( 5, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, t, s, &onnx_TensorProto_fields), + PB_FIELD( 6, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, g, t, &onnx_GraphProto_fields), + PB_FIELD( 7, FLOAT , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, floats, g, 0), + PB_FIELD( 8, INT64 , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, ints, floats, 0), + PB_FIELD( 9, BYTES , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, strings, ints, 0), + PB_FIELD( 10, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, tensors, strings, &onnx_TensorProto_fields), + PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, graphs, tensors, &onnx_GraphProto_fields), + PB_FIELD( 13, STRING , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, doc_string, graphs, 0), + PB_FIELD( 20, UENUM , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, type, doc_string, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_ValueInfoProto_fields[4] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_ValueInfoProto, name, name, 0), + PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, type, name, &onnx_TypeProto_fields), + PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, doc_string, type, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_NodeProto_fields[8] = { + PB_FIELD( 1, STRING , REPEATED, CALLBACK, FIRST, onnx_NodeProto, input, input, 0), + PB_FIELD( 2, STRING , REPEATED, CALLBACK, OTHER, onnx_NodeProto, output, input, 0), + PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, name, output, 0), + PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, op_type, name, 0), + PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_NodeProto, attribute, op_type, &onnx_AttributeProto_fields), + PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, doc_string, attribute, 0), + PB_FIELD( 7, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, domain, doc_string, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_ModelProto_fields[10] = { + PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_ModelProto, ir_version, ir_version, 0), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_name, ir_version, 0), + PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_version, producer_name, 0), + PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, domain, 
producer_version, 0), + PB_FIELD( 5, INT64 , OPTIONAL, STATIC , OTHER, onnx_ModelProto, model_version, domain, 0), + PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, doc_string, model_version, 0), + PB_FIELD( 7, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, graph, doc_string, &onnx_GraphProto_fields), + PB_FIELD( 8, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, opset_import, graph, &onnx_OperatorSetIdProto_fields), + PB_FIELD( 14, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, metadata_props, opset_import, &onnx_StringStringEntryProto_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_StringStringEntryProto_fields[3] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_StringStringEntryProto, key, key, 0), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_StringStringEntryProto, value, key, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_GraphProto_fields[8] = { + PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_GraphProto, node, node, &onnx_NodeProto_fields), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, name, node, 0), + PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, initializer, name, &onnx_TensorProto_fields), + PB_FIELD( 10, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, doc_string, initializer, 0), + PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, input, doc_string, &onnx_ValueInfoProto_fields), + PB_FIELD( 12, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, output, input, &onnx_ValueInfoProto_fields), + PB_FIELD( 13, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, value_info, output, &onnx_ValueInfoProto_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorProto_fields[13] = { + PB_FIELD( 1, INT64 , REPEATED, CALLBACK, FIRST, onnx_TensorProto, dims, dims, 0), + PB_FIELD( 2, UENUM , OPTIONAL, STATIC , OTHER, onnx_TensorProto, data_type, dims, 0), + PB_FIELD( 3, MESSAGE , OPTIONAL, STATIC , OTHER, onnx_TensorProto, segment, data_type, &onnx_TensorProto_Segment_fields), + PB_FIELD( 4, FLOAT , REPEATED, CALLBACK, OTHER, onnx_TensorProto, float_data, segment, 0), + PB_FIELD( 5, INT32 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int32_data, float_data, 0), + PB_FIELD( 6, BYTES , REPEATED, CALLBACK, OTHER, onnx_TensorProto, string_data, int32_data, 0), + PB_FIELD( 7, INT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int64_data, string_data, 0), + PB_FIELD( 8, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, name, int64_data, 0), + PB_FIELD( 9, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, raw_data, name, 0), + PB_FIELD( 10, DOUBLE , REPEATED, CALLBACK, OTHER, onnx_TensorProto, double_data, raw_data, 0), + PB_FIELD( 11, UINT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, uint64_data, double_data, 0), + PB_FIELD( 12, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, doc_string, uint64_data, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorProto_Segment_fields[3] = { + PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorProto_Segment, begin, begin, 0), + PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_TensorProto_Segment, end, begin, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorShapeProto_fields[2] = { + PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_TensorShapeProto, dim, dim, &onnx_TensorShapeProto_Dimension_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorShapeProto_Dimension_fields[3] = { + PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorShapeProto_Dimension, 
dim_value, dim_value, 0), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorShapeProto_Dimension, dim_param, dim_value, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TypeProto_fields[2] = { + PB_FIELD( 1, MESSAGE , OPTIONAL, CALLBACK, FIRST, onnx_TypeProto, tensor_type, tensor_type, &onnx_TypeProto_Tensor_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TypeProto_Tensor_fields[3] = { + PB_FIELD( 1, UENUM , OPTIONAL, STATIC , FIRST, onnx_TypeProto_Tensor, elem_type, elem_type, 0), + PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_TypeProto_Tensor, shape, elem_type, &onnx_TensorShapeProto_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_OperatorSetIdProto_fields[3] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_OperatorSetIdProto, domain, domain, 0), + PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_OperatorSetIdProto, version, domain, 0), + PB_LAST_FIELD +}; + + + + + +/* Check that field information fits in pb_field_t */ +#if !defined(PB_FIELD_32BIT) +/* If you get an error here, it means that you need to define PB_FIELD_32BIT + * compile-time option. You can do that in pb.h or on compiler command line. + * + * The reason you need to do this is that some of your messages contain tag + * numbers or field sizes that are larger than what can fit in 8 or 16 bit + * field descriptors. + */ +PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 65536), YOU_MUST_DEFINE_PB_FIELD_32BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) +#endif + +#if !defined(PB_FIELD_16BIT) && !defined(PB_FIELD_32BIT) +/* If you get an error here, it means that you need to define PB_FIELD_16BIT + * compile-time option. You can do that in pb.h or on compiler command line. + * + * The reason you need to do this is that some of your messages contain tag + * numbers or field sizes that are larger than what can fit in the default + * 8 bit descriptors. + */ +PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 256), YOU_MUST_DEFINE_PB_FIELD_16BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) +#endif + + +/* On some platforms (such as AVR), double is really float. + * These are not directly supported by nanopb, but see example_avr_double. + * To get rid of this error, remove any double fields from your .proto. + */ +PB_STATIC_ASSERT(sizeof(double) == 8, DOUBLE_MUST_BE_8_BYTES) + +/* @@protoc_insertion_point(eof) */ diff --git a/torch/csrc/onnx/onnx.npb.h b/torch/csrc/onnx/onnx.npb.h new file mode 100644 index 00000000000000..84d3b318643830 --- /dev/null +++ b/torch/csrc/onnx/onnx.npb.h @@ -0,0 +1,333 @@ +/* Automatically generated nanopb header */ +/* Generated by nanopb-0.3.9-dev */ + +#ifndef PB_ONNX_ONNX_PB_H_INCLUDED +#define PB_ONNX_ONNX_PB_H_INCLUDED +#include + +/* @@protoc_insertion_point(includes) */ +#if PB_PROTO_HEADER_VERSION != 30 +#error Regenerate this file with the current version of nanopb generator. 
+#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Enum definitions */ +typedef enum _onnx_Version { + onnx_Version__START_VERSION = 0, + onnx_Version_IR_VERSION_2017_10_10 = 1, + onnx_Version_IR_VERSION_2017_10_30 = 2, + onnx_Version_IR_VERSION = 3 +} onnx_Version; +#define _onnx_Version_MIN onnx_Version__START_VERSION +#define _onnx_Version_MAX onnx_Version_IR_VERSION +#define _onnx_Version_ARRAYSIZE ((onnx_Version)(onnx_Version_IR_VERSION+1)) + +typedef enum _onnx_AttributeProto_AttributeType { + onnx_AttributeProto_AttributeType_UNDEFINED = 0, + onnx_AttributeProto_AttributeType_FLOAT = 1, + onnx_AttributeProto_AttributeType_INT = 2, + onnx_AttributeProto_AttributeType_STRING = 3, + onnx_AttributeProto_AttributeType_TENSOR = 4, + onnx_AttributeProto_AttributeType_GRAPH = 5, + onnx_AttributeProto_AttributeType_FLOATS = 6, + onnx_AttributeProto_AttributeType_INTS = 7, + onnx_AttributeProto_AttributeType_STRINGS = 8, + onnx_AttributeProto_AttributeType_TENSORS = 9, + onnx_AttributeProto_AttributeType_GRAPHS = 10 +} onnx_AttributeProto_AttributeType; +#define _onnx_AttributeProto_AttributeType_MIN onnx_AttributeProto_AttributeType_UNDEFINED +#define _onnx_AttributeProto_AttributeType_MAX onnx_AttributeProto_AttributeType_GRAPHS +#define _onnx_AttributeProto_AttributeType_ARRAYSIZE ((onnx_AttributeProto_AttributeType)(onnx_AttributeProto_AttributeType_GRAPHS+1)) + +typedef enum _onnx_TensorProto_DataType { + onnx_TensorProto_DataType_UNDEFINED = 0, + onnx_TensorProto_DataType_FLOAT = 1, + onnx_TensorProto_DataType_UINT8 = 2, + onnx_TensorProto_DataType_INT8 = 3, + onnx_TensorProto_DataType_UINT16 = 4, + onnx_TensorProto_DataType_INT16 = 5, + onnx_TensorProto_DataType_INT32 = 6, + onnx_TensorProto_DataType_INT64 = 7, + onnx_TensorProto_DataType_STRING = 8, + onnx_TensorProto_DataType_BOOL = 9, + onnx_TensorProto_DataType_FLOAT16 = 10, + onnx_TensorProto_DataType_DOUBLE = 11, + onnx_TensorProto_DataType_UINT32 = 12, + onnx_TensorProto_DataType_UINT64 = 13, + onnx_TensorProto_DataType_COMPLEX64 = 14, + onnx_TensorProto_DataType_COMPLEX128 = 15 +} onnx_TensorProto_DataType; +#define _onnx_TensorProto_DataType_MIN onnx_TensorProto_DataType_UNDEFINED +#define _onnx_TensorProto_DataType_MAX onnx_TensorProto_DataType_COMPLEX128 +#define _onnx_TensorProto_DataType_ARRAYSIZE ((onnx_TensorProto_DataType)(onnx_TensorProto_DataType_COMPLEX128+1)) + +/* Struct definitions */ +typedef struct _onnx_GraphProto { + pb_callback_t node; + pb_callback_t name; + pb_callback_t initializer; + pb_callback_t doc_string; + pb_callback_t input; + pb_callback_t output; + pb_callback_t value_info; +/* @@protoc_insertion_point(struct:onnx_GraphProto) */ +} onnx_GraphProto; + +typedef struct _onnx_NodeProto { + pb_callback_t input; + pb_callback_t output; + pb_callback_t name; + pb_callback_t op_type; + pb_callback_t attribute; + pb_callback_t doc_string; + pb_callback_t domain; +/* @@protoc_insertion_point(struct:onnx_NodeProto) */ +} onnx_NodeProto; + +typedef struct _onnx_StringStringEntryProto { + pb_callback_t key; + pb_callback_t value; +/* @@protoc_insertion_point(struct:onnx_StringStringEntryProto) */ +} onnx_StringStringEntryProto; + +typedef struct _onnx_TensorShapeProto { + pb_callback_t dim; +/* @@protoc_insertion_point(struct:onnx_TensorShapeProto) */ +} onnx_TensorShapeProto; + +typedef struct _onnx_TypeProto { + pb_callback_t tensor_type; +/* @@protoc_insertion_point(struct:onnx_TypeProto) */ +} onnx_TypeProto; + +typedef struct _onnx_ValueInfoProto { + pb_callback_t name; + pb_callback_t type; + 
pb_callback_t doc_string; +/* @@protoc_insertion_point(struct:onnx_ValueInfoProto) */ +} onnx_ValueInfoProto; + +typedef struct _onnx_AttributeProto { + pb_callback_t name; + bool has_f; + float f; + bool has_i; + int64_t i; + pb_callback_t s; + pb_callback_t t; + pb_callback_t g; + pb_callback_t floats; + pb_callback_t ints; + pb_callback_t strings; + pb_callback_t tensors; + pb_callback_t graphs; + pb_callback_t doc_string; + bool has_type; + onnx_AttributeProto_AttributeType type; +/* @@protoc_insertion_point(struct:onnx_AttributeProto) */ +} onnx_AttributeProto; + +typedef struct _onnx_ModelProto { + bool has_ir_version; + int64_t ir_version; + pb_callback_t producer_name; + pb_callback_t producer_version; + pb_callback_t domain; + bool has_model_version; + int64_t model_version; + pb_callback_t doc_string; + pb_callback_t graph; + pb_callback_t opset_import; + pb_callback_t metadata_props; +/* @@protoc_insertion_point(struct:onnx_ModelProto) */ +} onnx_ModelProto; + +typedef struct _onnx_OperatorSetIdProto { + pb_callback_t domain; + bool has_version; + int64_t version; +/* @@protoc_insertion_point(struct:onnx_OperatorSetIdProto) */ +} onnx_OperatorSetIdProto; + +typedef struct _onnx_TensorProto_Segment { + bool has_begin; + int64_t begin; + bool has_end; + int64_t end; +/* @@protoc_insertion_point(struct:onnx_TensorProto_Segment) */ +} onnx_TensorProto_Segment; + +typedef struct _onnx_TensorShapeProto_Dimension { + bool has_dim_value; + int64_t dim_value; + pb_callback_t dim_param; +/* @@protoc_insertion_point(struct:onnx_TensorShapeProto_Dimension) */ +} onnx_TensorShapeProto_Dimension; + +typedef struct _onnx_TypeProto_Tensor { + bool has_elem_type; + onnx_TensorProto_DataType elem_type; + pb_callback_t shape; +/* @@protoc_insertion_point(struct:onnx_TypeProto_Tensor) */ +} onnx_TypeProto_Tensor; + +typedef struct _onnx_TensorProto { + pb_callback_t dims; + bool has_data_type; + onnx_TensorProto_DataType data_type; + bool has_segment; + onnx_TensorProto_Segment segment; + pb_callback_t float_data; + pb_callback_t int32_data; + pb_callback_t string_data; + pb_callback_t int64_data; + pb_callback_t name; + pb_callback_t raw_data; + pb_callback_t double_data; + pb_callback_t uint64_data; + pb_callback_t doc_string; +/* @@protoc_insertion_point(struct:onnx_TensorProto) */ +} onnx_TensorProto; + +/* Default values for struct fields */ + +/* Initializer values for message structs */ +#define onnx_AttributeProto_init_default {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} +#define onnx_ValueInfoProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_NodeProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_ModelProto_init_default {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_StringStringEntryProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_GraphProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_init_default {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_default, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, 
NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_Segment_init_default {false, 0, false, 0} +#define onnx_TensorShapeProto_init_default {{{NULL}, NULL}} +#define onnx_TensorShapeProto_Dimension_init_default {false, 0, {{NULL}, NULL}} +#define onnx_TypeProto_init_default {{{NULL}, NULL}} +#define onnx_TypeProto_Tensor_init_default {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} +#define onnx_OperatorSetIdProto_init_default {{{NULL}, NULL}, false, 0} +#define onnx_AttributeProto_init_zero {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} +#define onnx_ValueInfoProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_NodeProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_ModelProto_init_zero {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_StringStringEntryProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_GraphProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_init_zero {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_zero, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_Segment_init_zero {false, 0, false, 0} +#define onnx_TensorShapeProto_init_zero {{{NULL}, NULL}} +#define onnx_TensorShapeProto_Dimension_init_zero {false, 0, {{NULL}, NULL}} +#define onnx_TypeProto_init_zero {{{NULL}, NULL}} +#define onnx_TypeProto_Tensor_init_zero {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} +#define onnx_OperatorSetIdProto_init_zero {{{NULL}, NULL}, false, 0} + +/* Field tags (for use in manual encoding/decoding) */ +#define onnx_GraphProto_node_tag 1 +#define onnx_GraphProto_name_tag 2 +#define onnx_GraphProto_initializer_tag 5 +#define onnx_GraphProto_doc_string_tag 10 +#define onnx_GraphProto_input_tag 11 +#define onnx_GraphProto_output_tag 12 +#define onnx_GraphProto_value_info_tag 13 +#define onnx_NodeProto_input_tag 1 +#define onnx_NodeProto_output_tag 2 +#define onnx_NodeProto_name_tag 3 +#define onnx_NodeProto_op_type_tag 4 +#define onnx_NodeProto_domain_tag 7 +#define onnx_NodeProto_attribute_tag 5 +#define onnx_NodeProto_doc_string_tag 6 +#define onnx_StringStringEntryProto_key_tag 1 +#define onnx_StringStringEntryProto_value_tag 2 +#define onnx_TensorShapeProto_dim_tag 1 +#define onnx_TypeProto_tensor_type_tag 1 +#define onnx_ValueInfoProto_name_tag 1 +#define onnx_ValueInfoProto_type_tag 2 +#define onnx_ValueInfoProto_doc_string_tag 3 +#define onnx_AttributeProto_name_tag 1 +#define onnx_AttributeProto_doc_string_tag 13 +#define onnx_AttributeProto_type_tag 20 +#define onnx_AttributeProto_f_tag 2 +#define onnx_AttributeProto_i_tag 3 +#define onnx_AttributeProto_s_tag 4 +#define onnx_AttributeProto_t_tag 5 +#define onnx_AttributeProto_g_tag 6 +#define onnx_AttributeProto_floats_tag 7 +#define onnx_AttributeProto_ints_tag 8 +#define onnx_AttributeProto_strings_tag 9 +#define onnx_AttributeProto_tensors_tag 10 +#define onnx_AttributeProto_graphs_tag 11 +#define 
onnx_ModelProto_ir_version_tag 1 +#define onnx_ModelProto_opset_import_tag 8 +#define onnx_ModelProto_producer_name_tag 2 +#define onnx_ModelProto_producer_version_tag 3 +#define onnx_ModelProto_domain_tag 4 +#define onnx_ModelProto_model_version_tag 5 +#define onnx_ModelProto_doc_string_tag 6 +#define onnx_ModelProto_graph_tag 7 +#define onnx_ModelProto_metadata_props_tag 14 +#define onnx_OperatorSetIdProto_domain_tag 1 +#define onnx_OperatorSetIdProto_version_tag 2 +#define onnx_TensorProto_Segment_begin_tag 1 +#define onnx_TensorProto_Segment_end_tag 2 +#define onnx_TensorShapeProto_Dimension_dim_value_tag 1 +#define onnx_TensorShapeProto_Dimension_dim_param_tag 2 +#define onnx_TypeProto_Tensor_elem_type_tag 1 +#define onnx_TypeProto_Tensor_shape_tag 2 +#define onnx_TensorProto_dims_tag 1 +#define onnx_TensorProto_data_type_tag 2 +#define onnx_TensorProto_segment_tag 3 +#define onnx_TensorProto_float_data_tag 4 +#define onnx_TensorProto_int32_data_tag 5 +#define onnx_TensorProto_string_data_tag 6 +#define onnx_TensorProto_int64_data_tag 7 +#define onnx_TensorProto_name_tag 8 +#define onnx_TensorProto_doc_string_tag 12 +#define onnx_TensorProto_raw_data_tag 9 +#define onnx_TensorProto_double_data_tag 10 +#define onnx_TensorProto_uint64_data_tag 11 + +/* Struct field encoding specification for nanopb */ +extern const pb_field_t onnx_AttributeProto_fields[14]; +extern const pb_field_t onnx_ValueInfoProto_fields[4]; +extern const pb_field_t onnx_NodeProto_fields[8]; +extern const pb_field_t onnx_ModelProto_fields[10]; +extern const pb_field_t onnx_StringStringEntryProto_fields[3]; +extern const pb_field_t onnx_GraphProto_fields[8]; +extern const pb_field_t onnx_TensorProto_fields[13]; +extern const pb_field_t onnx_TensorProto_Segment_fields[3]; +extern const pb_field_t onnx_TensorShapeProto_fields[2]; +extern const pb_field_t onnx_TensorShapeProto_Dimension_fields[3]; +extern const pb_field_t onnx_TypeProto_fields[2]; +extern const pb_field_t onnx_TypeProto_Tensor_fields[3]; +extern const pb_field_t onnx_OperatorSetIdProto_fields[3]; + +/* Maximum encoded size of messages (where known) */ +/* onnx_AttributeProto_size depends on runtime parameters */ +/* onnx_ValueInfoProto_size depends on runtime parameters */ +/* onnx_NodeProto_size depends on runtime parameters */ +/* onnx_ModelProto_size depends on runtime parameters */ +/* onnx_StringStringEntryProto_size depends on runtime parameters */ +/* onnx_GraphProto_size depends on runtime parameters */ +/* onnx_TensorProto_size depends on runtime parameters */ +#define onnx_TensorProto_Segment_size 22 +/* onnx_TensorShapeProto_size depends on runtime parameters */ +/* onnx_TensorShapeProto_Dimension_size depends on runtime parameters */ +/* onnx_TypeProto_size depends on runtime parameters */ +/* onnx_TypeProto_Tensor_size depends on runtime parameters */ +/* onnx_OperatorSetIdProto_size depends on runtime parameters */ + +/* Message IDs (where set with "msgid" option) */ +#ifdef PB_MSGID + +#define ONNX_MESSAGES \ + + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif +/* @@protoc_insertion_point(eof) */ + +#endif diff --git a/torch/csrc/onnx/onnx.options b/torch/csrc/onnx/onnx.options new file mode 100644 index 00000000000000..dd02d208eb7698 --- /dev/null +++ b/torch/csrc/onnx/onnx.options @@ -0,0 +1,24 @@ +# Note [Callback for nested messages] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# nanopb's default translation for a nested, non-repeated (possibly +# optional) message is to include it *inline* (no indirection), with +# a boolean 
has_g/has_t field to indicate its presence or not. Why +# do we not like this? It's not compatible with our ownership model, +# where a TensorProto/GraphProto class owns the protobuf struct it +# is constructing. With the default translation, the protobuf struct +# occurs in two places: a TensorProto, AND the parent protobuf struct +# field. That's bad. Turning it back into a callback solves the +# ownership problem. +# +# Two more bonuses: at the cost of an indirection, we no longer waste fields +# when we aren't actually storing a graph/tensor; furthermore, circular +# dependencies now work! + +onnx.AttributeProto.g type:FT_CALLBACK +onnx.AttributeProto.t type:FT_CALLBACK +onnx.ModelProto.graph type:FT_CALLBACK +onnx.TypeProto.Tensor.shape type:FT_CALLBACK +onnx.TypeProto.tensor_type type:FT_CALLBACK +onnx.ValueInfoProto.type type:FT_CALLBACK +onnx.TypeProto no_unions:true +onnx.TensorShapeProto.Dimension no_unions:true diff --git a/torch/csrc/utils/hash.h b/torch/csrc/utils/hash.h index 954a7b5b7d0814..05a5a27b51223a 100644 --- a/torch/csrc/utils/hash.h +++ b/torch/csrc/utils/hash.h @@ -32,7 +32,7 @@ namespace torch { // DEALINGS IN THE SOFTWARE. inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); + return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index 0160bdd2d8e506..f8d5fd1ba1cd63 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -16,7 +16,7 @@ std::string py_typename(PyObject *object) { struct Type { virtual bool is_matching(PyObject *object) = 0; - virtual ~Type() = default; + virtual ~Type() {}; }; struct SimpleType: public Type { diff --git a/torch/csrc/utils/invalid_arguments.h b/torch/csrc/utils/invalid_arguments.h index daaccfd877f377..138c3331113b7c 100644 --- a/torch/csrc/utils/invalid_arguments.h +++ b/torch/csrc/utils/invalid_arguments.h @@ -7,9 +7,7 @@ namespace torch { std::string format_invalid_args( - PyObject* given_args, - PyObject* given_kwargs, - const std::string& function_name, + PyObject *args, PyObject *kwargs, const std::string& name, const std::vector& options); } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 0f2f51904c2554..b00bd27c087495 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -90,8 +90,8 @@ struct PythonArgParser { private: [[noreturn]] - void print_error(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); - PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); + void print_error(PyObject* args, PyObject* kwargs, PyObject* dst[]); + PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* dst[]); std::vector signatures_; std::string function_name; diff --git a/torch/csrc/utils/tensor_apply.h b/torch/csrc/utils/tensor_apply.h index 5dfdef98c81db4..47fbaa672c4262 100644 --- a/torch/csrc/utils/tensor_apply.h +++ b/torch/csrc/utils/tensor_apply.h @@ -6,8 +6,8 @@ namespace torch { namespace utils { at::Tensor & apply_(at::Tensor & self, PyObject* fn); -at::Tensor & map_(at::Tensor & self, const at::Tensor & other_, PyObject* fn); -at::Tensor & map2_(at::Tensor & self, const at::Tensor & x_, - const at::Tensor & y_, PyObject* fn); +at::Tensor & map_(at::Tensor & self, const at::Tensor & 
other, PyObject* fn); +at::Tensor & map2_(at::Tensor & self, const at::Tensor & other1, + const at::Tensor & other2, PyObject* fn); }} // namespace torch::utils diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index d03fd55f2accfc..3a8b4a7bbc1592 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -139,10 +139,8 @@ ScalarType infer_scalar_type(PyObject *obj) { } #ifdef USE_NUMPY if (PyArray_Check(obj)) { - return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)obj)); - } - if (PyArray_CheckScalar(obj)) { - return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, NULL)))); + auto array = (PyArrayObject*)obj; + return numpy_dtype_to_aten(PyArray_TYPE(array)); } #endif if (PySequence_Check(obj)) { diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index a2086ae95b899c..f8b26b121fd3e8 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -61,8 +61,7 @@ def init_process_group(backend, init_method='env://', **kwargs): group_name (str, optional): Group name. See description of init methods. To enable ``backend == mpi``, PyTorch needs to built from source on a system that - supports MPI. If you want to use Openmpi with CUDA-aware support, please use Openmpi - major version 2 and above. + supports MPI. """ world_size = kwargs.pop('world_size', -1) diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index 47ee177c2cc959..ca961d88ba0a63 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -96,7 +96,6 @@ from .lowrank_multivariate_normal import LowRankMultivariateNormal from .multinomial import Multinomial from .multivariate_normal import MultivariateNormal -from .negative_binomial import NegativeBinomial from .normal import Normal from .one_hot_categorical import OneHotCategorical from .pareto import Pareto @@ -130,7 +129,6 @@ 'LogisticNormal', 'Multinomial', 'MultivariateNormal', - 'NegativeBinomial', 'Normal', 'OneHotCategorical', 'Pareto', diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index f8688af3f3a392..a263082c967fe1 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -164,9 +164,7 @@ def _transform_to_positive(constraint): @biject_to.register(constraints.greater_than) -@biject_to.register(constraints.greater_than_eq) @transform_to.register(constraints.greater_than) -@transform_to.register(constraints.greater_than_eq) def _transform_to_greater_than(constraint): return transforms.ComposeTransform([transforms.ExpTransform(), transforms.AffineTransform(constraint.lower_bound, 1)]) @@ -180,9 +178,7 @@ def _transform_to_less_than(constraint): @biject_to.register(constraints.interval) -@biject_to.register(constraints.half_open_interval) @transform_to.register(constraints.interval) -@transform_to.register(constraints.half_open_interval) def _transform_to_interval(constraint): # Handle the special case of the unit interval. 
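    # (when the bounds are exactly 0 and 1, a bare SigmoidTransform is returned instead of composing it with an affine rescaling)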
lower_is_0 = isinstance(constraint.lower_bound, numbers.Number) and constraint.lower_bound == 0 diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 0b6eb53b0cd93a..18da2bff1392a4 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -27,10 +27,8 @@ 'dependent', 'dependent_property', 'greater_than', - 'greater_than_eq', 'integer_interval', 'interval', - 'half_open_interval', 'is_dependent', 'less_than', 'lower_cholesky', @@ -153,17 +151,6 @@ def check(self, value): return self.lower_bound < value -class _GreaterThanEq(Constraint): - """ - Constrain to a real half line `[lower_bound, inf)`. - """ - def __init__(self, lower_bound): - self.lower_bound = lower_bound - - def check(self, value): - return self.lower_bound <= value - - class _LessThan(Constraint): """ Constrain to a real half line `[-inf, upper_bound)`. @@ -187,18 +174,6 @@ def check(self, value): return (self.lower_bound <= value) & (value <= self.upper_bound) -class _HalfOpenInterval(Constraint): - """ - Constrain to a real interval `[lower_bound, upper_bound)`. - """ - def __init__(self, lower_bound, upper_bound): - self.lower_bound = lower_bound - self.upper_bound = upper_bound - - def check(self, value): - return (self.lower_bound <= value) & (value < self.upper_bound) - - class _Simplex(Constraint): """ Constrain to the unit simplex in the innermost (rightmost) dimension. @@ -265,11 +240,9 @@ def check(self, value): real_vector = _RealVector() positive = _GreaterThan(0.) greater_than = _GreaterThan -greater_than_eq = _GreaterThanEq less_than = _LessThan unit_interval = _Interval(0., 1.) interval = _Interval -half_open_interval = _HalfOpenInterval simplex = _Simplex() lower_triangular = _LowerTriangular() lower_cholesky = _LowerCholesky() diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py deleted file mode 100644 index 854ad5b7b087fa..00000000000000 --- a/torch/distributions/negative_binomial.py +++ /dev/null @@ -1,83 +0,0 @@ -import torch -import torch.nn.functional as F -from torch.distributions import constraints -from torch.distributions.distribution import Distribution -from torch.distributions.utils import broadcast_all, probs_to_logits, lazy_property, logits_to_probs - - -class NegativeBinomial(Distribution): - r""" - Creates a Negative Binomial distribution, i.e. distribution - of the number of independent identical Bernoulli trials - needed before `total_count` failures are achieved. The probability - of success of each Bernoulli trial is `probs`. 
- - Args: - total_count (float or Tensor): non-negative number of negative Bernoulli - trials to stop, although the distribution is still valid for real - valued count - probs (Tensor): Event probabilities of success in the half open interval [0, 1) - logits (Tensor): Event log-odds for probabilities of success - """ - arg_constraints = {'total_count': constraints.greater_than_eq(0), - 'probs': constraints.half_open_interval(0., 1.)} - support = constraints.nonnegative_integer - - def __init__(self, total_count, probs=None, logits=None, validate_args=None): - if (probs is None) == (logits is None): - raise ValueError("Either `probs` or `logits` must be specified, but not both.") - if probs is not None: - self.total_count, self.probs, = broadcast_all(total_count, probs) - self.total_count = self.total_count.type_as(self.probs) - else: - self.total_count, self.logits, = broadcast_all(total_count, logits) - self.total_count = self.total_count.type_as(self.logits) - - self._param = self.probs if probs is not None else self.logits - batch_shape = self._param.size() - super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args) - - def _new(self, *args, **kwargs): - return self._param.new(*args, **kwargs) - - @property - def mean(self): - return self.total_count * torch.exp(self.logits) - - @property - def variance(self): - return self.mean / torch.sigmoid(-self.logits) - - @lazy_property - def logits(self): - return probs_to_logits(self.probs, is_binary=True) - - @lazy_property - def probs(self): - return logits_to_probs(self.logits, is_binary=True) - - @property - def param_shape(self): - return self._param.size() - - @lazy_property - def _gamma(self): - return torch.distributions.Gamma(concentration=self.total_count, - rate=torch.exp(-self.logits)) - - def sample(self, sample_shape=torch.Size()): - with torch.no_grad(): - rate = self._gamma.sample(sample_shape=sample_shape) - return torch.poisson(rate) - - def log_prob(self, value): - if self._validate_args: - self._validate_sample(value) - - log_unnormalized_prob = (self.total_count * F.logsigmoid(-self.logits) + - value * F.logsigmoid(self.logits)) - - log_normalization = (-torch.lgamma(self.total_count + value) + torch.lgamma(1. + value) + - torch.lgamma(self.total_count)) - - return log_unnormalized_prob - log_normalization diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 0219942aac155a..ccc0ffffa2ec21 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -32,19 +32,30 @@ def _finfo(tensor): return _FINFO[tensor.storage_type()] -# promote numbers to tensors of dtype torch.get_default_dtype() -def _default_promotion(v): - return torch.tensor(v, dtype=torch.get_default_dtype()) +def _broadcast_shape(shapes): + r""" + Given a list of tensor sizes, returns the size of the resulting broadcasted + tensor. + + Args: + shapes (list of torch.Size): list of tensor sizes + """ + shape = torch.Size() + for s in shapes: + shape = torch._C._infer_size(s, shape) + return shape def broadcast_all(*values): r""" Given a list of values (possibly containing numbers), returns a list where each value is broadcasted based on the following rules: - - `torch.*Tensor` instances are broadcasted as per :ref:`_broadcasting-semantics`. + - `torch.*Tensor` instances are broadcasted as per the `broadcasting rules + `_ - numbers.Number instances (scalars) are upcast to tensors having the same size and type as the first tensor passed to `values`. 
If all the - values are scalars, then they are upcasted to scalar Tensors. + values are scalars, then they are upcasted to Tensors having size + `(1,)`. Args: values (list of `numbers.Number` or `torch.*Tensor`) @@ -53,16 +64,22 @@ def broadcast_all(*values): ValueError: if any of the values is not a `numbers.Number` or `torch.*Tensor` instance """ - if not all(torch.is_tensor(v) or isinstance(v, Number) for v in values): + values = list(values) + scalar_idxs = [i for i in range(len(values)) if isinstance(values[i], Number)] + tensor_idxs = [i for i in range(len(values)) if values[i].__class__.__name__ == 'Tensor'] + if len(scalar_idxs) + len(tensor_idxs) != len(values): raise ValueError('Input arguments must all be instances of numbers.Number or torch.tensor.') - if not all(map(torch.is_tensor, values)): - new_tensor = _default_promotion - for value in values: - if torch.is_tensor(value): - new_tensor = value.new_tensor - break - values = [v if torch.is_tensor(v) else new_tensor(v) for v in values] - return torch.broadcast_tensors(*values) + if tensor_idxs: + broadcast_shape = _broadcast_shape([values[i].size() for i in tensor_idxs]) + for idx in tensor_idxs: + values[idx] = values[idx].expand(broadcast_shape) + template = values[tensor_idxs[0]] + for idx in scalar_idxs: + values[idx] = template.new(template.size()).fill_(values[idx]) + else: + for idx in scalar_idxs: + values[idx] = torch.tensor(float(values[idx])) + return values def _sum_rightmost(value, dim): diff --git a/torch/functional.py b/torch/functional.py index 0133a012981854..19d47f394fa757 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -10,7 +10,6 @@ 'argmin', 'btrifact', 'btriunpack', - 'broadcast_tensors', 'isfinite', 'isinf', 'isnan', @@ -20,28 +19,6 @@ ] -def broadcast_tensors(*tensors): - r"""broadcast_tensors(*tensors) -> List of Tensors - - Broadcasts the given tensors according to :ref:`_broadcasting-semantics`. - - Args: - *tensors: any number of tensors of the same type - - Example:: - - >>> x = torch.arange(3).view(1, 3) - >>> y = torch.arange(2).view(2, 1) - >>> a, b = torch.broadcast_tensors(x, y) - >>> a.size() - torch.Size([2, 3]) - >>> a - tensor([[0, 1, 2], - [0, 1, 2]]) - """ - return torch._C._VariableFunctions.broadcast_tensors(tensors) - - def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. 
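The `broadcast_all` hunk above swaps `torch.broadcast_tensors` for an explicit `_broadcast_shape`/`expand` path, but its observable behavior for a scalar/tensor mix is the same on both sides of the diff. A minimal usage sketch (assuming a PyTorch build from either side of this change); the shapes follow from standard broadcasting semantics:

    >>> import torch
    >>> from torch.distributions.utils import broadcast_all
    >>> probs = torch.rand(2, 3)
    >>> total_count, probs = broadcast_all(10., probs)  # the scalar 10. is upcast to a tensor like `probs`
    >>> total_count.shape, probs.shape
    (torch.Size([2, 3]), torch.Size([2, 3]))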
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index d09e970f729470..c0cf4f9d1c2e75 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -403,12 +403,9 @@ def wrapper(*args): else: new_args.append(arg) res = res_mod(*new_args) - assert len(res) % 3 == 0 - if len(res) % 3 != 0: - raise "non-batched-tensor output is not supported yet" - result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] - if len(result) == 1: - return result[0] + # assert len(res) / 3 == 0 + # result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] + result = BatchTensor(*res) return result wrapper.__doc__ = fn.__doc__ return wrapper diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 1db7749e07e34e..77e6cf777f2784 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -3,7 +3,7 @@ import ast import inspect import torch -from torch._C import DynamicType, TupleType, FloatType, IntType +from torch._C import DynamicType, TupleType from textwrap import dedent @@ -204,13 +204,9 @@ def as_ann(ann): def ann_to_type(ann): if ann is None: - return DynamicType.get() + return DynamicType() elif ann is torch.Tensor: - return DynamicType.get() + return DynamicType() elif is_tuple(ann): return TupleType([ann_to_type(a) for a in ann.__args__]) - elif ann is float: - return FloatType.get() - elif ann is int: - return IntType.get() raise ValueError("The only supported annotations kinds are Tensor and Tuple[...]") diff --git a/torch/jit/batchop.py b/torch/jit/batchop.py index 053130dc0fb488..bda6a3adca3a88 100644 --- a/torch/jit/batchop.py +++ b/torch/jit/batchop.py @@ -1,9 +1,6 @@ import torch -from torch.jit import BatchTensor -# TODO: there are some commented raise statements -# when we support rasie exception in script, we want to check them @torch.jit.script def batch_tanh(data, mask, dims): data = torch.tanh(data) @@ -17,52 +14,13 @@ def batch_sigmoid(data, mask, dims): @torch.jit.script -def batch_relu(data, mask, dims): - data = torch.relu(data) - return data, mask, dims - - -@torch.jit.script -def batch_neg(data, mask, dims): - data = torch.neg(data) - return data, mask, dims - - -@torch.jit.script -def batch_neg_scalar(data): - return torch.neg(data) - - -@torch.jit.script -def batch_add(data1, mask1, dims1, data2, mask2, dims2, alpha_): - alpha = float(alpha_) - data = torch.add(data1, data2, alpha) - mask = mask1 * mask2 - dims = dims1 or dims2 - return data, mask, dims - - -@torch.jit.script -def batch_add_scalar(data, mask, dims, other, alpha_): - alpha = float(alpha_) - data = torch.add(data, other.type_as(data), alpha) - return data, mask, dims - - -@torch.jit.script -def batch_sub(data1, mask1, dims1, data2, mask2, dims2, alpha_): - alpha = float(alpha_) - data = torch.sub(data1, data2, alpha) +def batch_add(data1, mask1, dims1, data2, mask2, dims2): + data = torch.add(data1, data2) mask = mask1 * mask2 dims = dims1 or dims2 return data, mask, dims -@torch.jit.script -def batch_sub_scalar(data1, data2): - return data1 - data2 - - @torch.jit.script def batch_mul(data1, mask1, dims1, data2, mask2, dims2): data = torch.mul(data1, data2) @@ -71,17 +29,6 @@ def batch_mul(data1, mask1, dims1, data2, mask2, dims2): return data, mask, dims -@torch.jit.script -def batch_mul_scalar(data1, data2): - return data1 * data2 - - -@torch.jit.script -def batch_div(data, mask, dims, other): # div(batchtensor, scalar) - data = torch.div(data, other) - return data, mask, dims - - @torch.jit.script def batch_mm(data1, mask1, dims1, data2, 
mask2, dims2): data1 = data1 * mask1.type_as(data1) @@ -141,388 +88,26 @@ def batch_select(data, mask, dims, dim_, index_): # raise ValueError("Cannot select 0 dim in BatchTensor") data = data.select(dim, index) if dims[dim - 1]: - mask = mask.select(dim, index) - else: mask = mask.select(dim, 0) + else: + mask = mask.select(dim, index) dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) return data, mask, dims -@torch.jit.script -def batch_fmod(data, mask, dims, other_): - other = int(other_) - data = torch.fmod(data, other) - return data, mask, dims - - -@torch.jit.script -def batch_zeros_like(data, mask, dims): - res_data = torch.zeros_like(data) - return res_data, mask, dims - - -@torch.jit.script -def batch_index_select(data, mask, dims, dim_, index_data, index_mask, index_dims): - dim = int(dim_) - # if dim == 0: - # raise ValueError("Cannot index_select along 0 dim in BatchTensor") - batch_size = data.size(0) # TODO maybe index_mask will be used at some point - res_data = torch.zeros([0]) - res_mask = torch.zeros([0]) - for i in range(batch_size): - d = data[i].index_select(dim - 1, index_data[i]).unsqueeze(0) - if dims[dim - 1]: - m = mask[i].index_select(dim - 1, index_data[i]).unsqueeze(0) - else: - m = mask[i].unsqueeze(0) - if i == 0: - res_data = d - res_mask = m - else: - res_data = torch.cat((res_data, d), 0) - res_mask = torch.cat((res_mask, m), 0) - return res_data, res_mask, dims - - -@torch.jit.script -def batch_view_as(data, mask, dims, data1, mask1, dims1): - # if data.size(0) != data1.size(0): - # raise ValueError("In view_as, tensor and target tensor should have the same batch_size") - # if not torch.equal(dims, dims1): - # raise ValueError("In batched view_as, dims and target dims should be the same") - data = data.view_as(data1) - mask = mask.view_as(mask1) - dims = dims1 - return data, mask, dims - - # assume data, data1, data2 have same size @torch.jit.script def batch_where(data, mask, dims, data1, mask1, dims1, data2, mask2, dims2): - data = data * mask.type_as(data) - cond_data = data - cond_mask = data - if data.dim() == 1: - for _ in range(data1.dim() - 1): - data = data.unsqueeze(data.dim()) - cond_data = data.expand_as(data1) - cond_mask = data.expand_as(mask1) - res_data = torch.where(cond_data, data1, data2) - res_mask = torch.where(cond_mask, mask1, mask2) + res_data = torch.where(data, data1, data2) + res_mask = torch.where(data, mask1, mask2) res_dims = dims1 or dims2 return res_data, res_mask, res_dims - -@torch.jit.script -def batch_where_scalar(cond_, data1, mask1, dims1, data2, mask2, dims2): - cond = torch.zeros([1], dtype=torch.uint8) * cond_ - res_data = torch.where(cond, data1, data2) - res_mask = torch.where(cond, mask1, mask2) - res_dims = torch.where(cond, dims1, dims2) - return res_data, res_mask, res_dims - - -@torch.jit.script -def batch_update(batch_data, batch_mask, batch_dims, new_data, new_mask, new_dims): - data = torch.where(new_mask, new_data, batch_data) - return data, new_mask, new_dims # TODO: consider whether return new_mask and new_dims - - -@torch.jit.script -def batch_any(data, mask, dims): - return torch.gt(torch.sum(data * mask), 0) - - -@torch.jit.script -def batch_type_as(data, mask, dims, data1, mask1, dims1): - return data.type_as(data1), mask, dims - - -@torch.jit.script -def batch_gt(data, mask, dims, data1, mask1, dims1): - return torch.gt(data, data1), mask * mask1, dims or dims1 - - -@torch.jit.script -def batch_gt_scalar(data1, data2): - return torch.gt(data1, data2) - - -@torch.jit.script -def 
batch_gt_one_scalar(data, mask, dims, other_): - other = float(other_) - return torch.gt(data, other), mask, dims - - -@torch.jit.script -def batch_lt(data, mask, dims, data1, mask1, dims1): - return torch.lt(data, data1), mask * mask1, dims or dims1 - - -@torch.jit.script -def batch_eq(data, mask, dims, data1, mask1, dims1): - return torch.eq(data, data1), mask * mask1, dims or dims1 - - -@torch.jit.script -def batch_size(data, mask, dims, dim_): - dim = int(dim_) - return data.size(dim) - - -@torch.jit.script -def batch_dim(data, mask, dims): - return data.dim() - - -@torch.jit.script -def batch_squeeze(data, mask, dims, dim_): - if int(dim_) < 0: - dim_ += data.dim() - dim = int(dim_) - # if dim == 0: - # raise ValueError("cannot do squeeze along batch_dim") - data = data.squeeze(dim) - mask = mask.squeeze(dim) - dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) - return data, mask, dims - - -@torch.jit.script -def batch_unsqueeze(data, mask, dims, dim_): - if int(dim_) < 0: - dim_ += data.dim() + 1 - dim = int(dim_) - # if dim == 0: - # raise ValueError("cannot do unsqueeze along batch_dim") - data = data.unsqueeze(dim) - mask = mask.unsqueeze(dim) - dims = torch.cat((dims[:dim], torch.zeros([1], dtype=torch.uint8), dims[dim:dims.size(0)])) - return data, mask, dims - - -@torch.jit.script -def batch_argmax(data, mask, dims, dim_, keepdim_): - dim = int(dim_) - keepdim = int(keepdim_) - # if dim == 0: - # raise ValueError("cannot do argmax along batch_dim") - batch_size = data.size(0) - res_data = torch.zeros([0]) - for i in range(batch_size): - if dims[dim - 1]: - if dim - 1 != 0: - m = mask[i].transpose(0, dim - 1) - else: - m = mask[i] - valid_num = m.sum(0, keepdim=True) - while(valid_num.dim() >= 1): - valid_num = valid_num[0] - d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) - else: - d = data[i].unsqueeze(0) - d = d.argmax(dim, keepdim) - if i == 0: - res_data = d - else: - res_data = torch.cat([res_data, d], 0) - if keepdim: - mask = mask - else: - mask = mask.select(dim, 0) - dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) - return res_data, mask, dims - - -@torch.jit.script -def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): - k = int(k_) - dim = int(dim_) - largest = int(largest_) - sorted = int(sorted_) - # if dim == 0: - # raise ValueError("cannot do topk along batch_dim") - batch_size = data.size(0) - res_data = torch.zeros([0]) - res_index = torch.zeros([0]) - for i in range(batch_size): - if dims[dim - 1]: - if dim - 1 != 0: - m = mask[i].transpose(0, dim - 1) - else: - m = mask[i] - valid_num = m.sum(0, keepdim=True) - while(valid_num.dim() >= 1): - valid_num = valid_num[0] - d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) - else: - d = data[i].unsqueeze(0) - d, idx = d.topk(k, dim, largest, sorted) - if i == 0: - res_data = d - res_index = idx - else: - res_data = torch.cat([res_data, d], 0) - res_index = torch.cat([res_index, idx], 0) - if dims[dim - 1]: - mask = mask.narrow(dim, 0, k) - return res_data, mask, dims, res_index, mask, dims - - -@torch.jit.script -def batch_softmax(data, mask, dims, dim_): - dim = int(dim_) - # if dim == 0: - # raise ValueError("cannot do softmax along batch_dim") - batch_size = data.size(0) - max_len = data.size(dim) - res_data = torch.zeros([0]) - for i in range(batch_size): - if dims[dim - 1]: - if dim - 1 != 0: - m = mask[i].transpose(0, dim - 1) - else: - m = mask[i] - valid_num = m.sum(0, keepdim=True) - while(valid_num.dim() >= 1): - valid_num = valid_num[0] - valid_num = 
int(valid_num) - d = data[i].unsqueeze(0).narrow(dim, 0, valid_num).softmax(dim) - if valid_num < max_len: - d = torch.cat([d, data[i].unsqueeze(0).narrow(dim, valid_num, max_len - valid_num)], dim) - else: - d = data[i].unsqueeze(0).softmax(dim) - if i == 0: - res_data = d - else: - res_data = torch.cat([res_data, d], 0) - return res_data, mask, dims - - -# size argument in dynamic dimension has to be -1 -# in static dimension, size has to be specified, -1 is not supported -@torch.jit.script -def batch_view(data, mask, dims, sizes): - batch_size = data.size(0) - # if(sizes[0] != batch_size and sizes[0] != -1 and sizes[0] != 1): - # raise "first dim in view must be 1, -1, or batch size" - # for i in range(dims.size(0)): - # if dims[0] == 1 and sizes[i + 1] != -1: - # raise "size argument in dynamic dimension has to be -1" - sizes = sizes.type_as(torch.ones([1], dtype=torch.int)) - data_sizes_ = torch.cat([torch.ones([1], dtype=torch.int) * batch_size, sizes.narrow(0, 1, sizes.size(0) - 1)], 0) - data_sizes = data_sizes_._tensor_to_list() - res_data = data.view(data_sizes) - mask_sizes_ = data_sizes_.narrow(0, 0, 1) - res_dims = data_sizes_.narrow(0, 0, 1) - for i_ in range(sizes.size(0) - 1): - i = i_ + 1 - if(sizes[i] == -1): - cur_size_ = mask.size(i) - cur_dim = 1 - else: - cur_size_ = 1 - cur_dim = 0 - mask_sizes_ = torch.cat([mask_sizes_, torch.ones([1], dtype=torch.int) * cur_size_]) - res_dims = torch.cat([res_dims, torch.ones([1], dtype=torch.int) * cur_dim]) - mask_sizes = mask_sizes_._tensor_to_list() - res_mask = mask.view(mask_sizes) - return res_data, res_mask, res_dims.narrow(0, 1, res_dims.size(0) - 1).type_as(dims) - - -@torch.jit.script -def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): - dim = int(dim_) - data = torch.cat([data1, data2], dim) - if(dims1[dim - 1]): - mask = torch.cat([mask1, mask2], dim) - else: - mask = mask1 - return data, mask, dims1 - - -@torch.jit.script -def batch_cat3(data1, mask1, dims1, data2, mask2, dims2, data3, mask3, dims3, dim_): - dim = int(dim_) - data = torch.cat([data1, data2, data3], dim) - if(dims1[dim - 1]): - mask = torch.cat([mask1, mask2, mask3], dim) - else: - mask = mask1 - return data, mask, dims1 - - -@torch.jit.script -def batch_narrow(data, mask, dims, dimension_, start_, length_): - dimension = int(dimension_) - start = int(start_) - length = int(length_) - # if dimension == 0: - # raise ValueError("cannot do narrow along batch_dim") - data = data.narrow(dimension, start, length) - if dims[dimension - 1]: - mask = mask.narrow(dimension, start, length) - else: - mask = mask.narrow(dimension, 0, 1) - return data, mask, dims - - -@torch.jit.script -def batch_sum(data, mask, dims): - data = data * mask.type_as(data) - for _ in range(dims.size(0)): - data = data.sum(1) - mask = torch.ones([data.size(0)], dtype=torch.uint8) - dims = dims[:0] # empty tensor - return data, mask, dims - - -@torch.jit.script -def batch_from_scalar_tensor(data): - data = data.unsqueeze(0) - mask = torch.ones([1], dtype=torch.uint8) - dims = torch.zeros([0], dtype=torch.uint8) - return data, mask, dims - torch.register_batch_operator("tanh", batch_tanh.graph) torch.register_batch_operator("sigmoid", batch_sigmoid.graph) -torch.register_batch_operator("relu", batch_relu.graph) -torch.register_batch_operator("neg", batch_neg.graph) -torch.register_batch_operator("neg", batch_neg_scalar.graph) torch.register_batch_operator("add", batch_add.graph) -torch.register_batch_operator("add", batch_add_scalar.graph) 
-torch.register_batch_operator("sub", batch_sub.graph) -torch.register_batch_operator("sub", batch_sub_scalar.graph) torch.register_batch_operator("mul", batch_mul.graph) -torch.register_batch_operator("mul", batch_mul_scalar.graph) -torch.register_batch_operator("div", batch_div.graph) torch.register_batch_operator("matmul", batch_matmul.graph) torch.register_batch_operator("mm", batch_mm.graph) -torch.register_batch_operator("fmod", batch_fmod.graph) -torch.register_batch_operator("zeros_like", batch_zeros_like.graph) torch.register_batch_operator("select", batch_select.graph) -torch.register_batch_operator("index_select", batch_index_select.graph) -torch.register_batch_operator("view_as", batch_view_as.graph) torch.register_batch_operator("where", batch_where.graph) -torch.register_batch_operator("where", batch_where_scalar.graph) -torch.register_batch_operator("update", batch_update.graph) -torch.register_batch_operator("any", batch_any.graph) -torch.register_batch_operator("type_as", batch_type_as.graph) -torch.register_batch_operator("gt", batch_gt.graph) -torch.register_batch_operator("gt", batch_gt_scalar.graph) -torch.register_batch_operator("gt", batch_gt_one_scalar.graph) -torch.register_batch_operator("lt", batch_lt.graph) -torch.register_batch_operator("eq", batch_eq.graph) -torch.register_batch_operator("size", batch_size.graph) -torch.register_batch_operator("dim", batch_dim.graph) -torch.register_batch_operator("squeeze", batch_squeeze.graph) -torch.register_batch_operator("unsqueeze", batch_unsqueeze.graph) -torch.register_batch_operator("argmax", batch_argmax.graph) -torch.register_batch_operator("topk", batch_topk.graph) -torch.register_batch_operator("softmax", batch_softmax.graph) -torch.register_batch_operator("view", batch_view.graph) -torch.register_batch_operator("cat", batch_cat2.graph) -torch.register_batch_operator("cat", batch_cat3.graph) -torch.register_batch_operator("narrow", batch_narrow.graph) -torch.register_batch_operator("sum", batch_sum.graph) -torch.register_batch_operator("batch_from_scalar_tensor", batch_from_scalar_tensor.graph) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index bc979d15141121..d152b2010fcae4 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -435,8 +435,8 @@ def build_List(ctx, expr): @staticmethod def build_Tuple(ctx, expr): - return TupleLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), - [build_expr(ctx, e) for e in expr.elts]) + return ListLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), + [build_expr(ctx, e) for e in expr.elts]) @staticmethod def build_Num(ctx, expr): diff --git a/torch/legacy/nn/ELU.py b/torch/legacy/nn/ELU.py index 9e00e8a172fc88..6ad240658a9e28 100644 --- a/torch/legacy/nn/ELU.py +++ b/torch/legacy/nn/ELU.py @@ -23,7 +23,6 @@ def updateOutput(self, input): self.output, self.alpha, 1.0, - 1.0, self.inplace ) return self.output @@ -35,7 +34,6 @@ def updateGradInput(self, input, gradOutput): self.gradInput, self.output, self.alpha, - 1.0, 1.0 ) return self.gradInput diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp index b23157581bdfc0..cc176931d8c0c2 100644 --- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp +++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp @@ -100,14 +100,6 @@ void DataChannelMPI::destroy() {} bool DataChannelMPI::init() { -#ifdef OMPI_MAJOR_VERSION - // OMPI_* is specific to Openmpi implementation. 
- // Openmpi v1.10 segfaults in MPI_Bcast with CUDA buffer. - if (int(OMPI_MAJOR_VERSION) < 2) { - throw std::runtime_error("Please use Openmpi major version 2 and above for distributed."); - } -#endif /* OMPI_MAJOR_VERSION */ - int provided; MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); if (provided != MPI_THREAD_MULTIPLE) { diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 9bb0ef0e98ca82..26f6c480420b67 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -64,7 +64,7 @@ inline std::vector<std::vector<int64_t>> getSizes( const std::vector<at::Tensor>& tensors) { std::vector<std::vector<int64_t>> sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { - sizes[i] = tensors[i].sizes().vec(); + sizes[i] = tensors[i].sizes(); } return sizes; } diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 746c2664529175..17a7c09b012da6 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -741,25 +741,6 @@ def selu(input, inplace=False): """) -def celu(input, alpha=1., inplace=False): - r"""celu(input, alpha=1., inplace=False) -> Tensor - - Applies element-wise, - :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))`. - - See :class:`~torch.nn.CELU` for more details. - """ - if inplace: - return torch.celu_(input, alpha) - return torch.celu(input, alpha) - -celu_ = _add_docstr(torch.celu_, r""" -celu_(input, alpha=1.) -> Tensor - -In-place version of :func:`~celu`. -""") - - def leaky_relu(input, negative_slope=0.01, inplace=False): r""" leaky_relu(input, negative_slope=0.01, inplace=False) -> Tensor @@ -878,7 +859,7 @@ def softmin(input, dim=None, _stacklevel=3): """ if dim is None: dim = _get_softmax_dim('softmin', input.dim(), _stacklevel) - return (-input).softmax(dim) + return -input.softmax(dim) def softmax(input, dim=None, _stacklevel=3): @@ -1118,7 +1099,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, assert padding_idx >= -weight.size(0), 'Padding_idx must be within num_embeddings' padding_idx = weight.size(0) + padding_idx elif padding_idx is None: - padding_idx = -1 + padding_idx = -1 if max_norm is not None: # `embedding_renorm_` will call .contiguous() on input anyways, so we # call it here and take advantage of the improved locality in the @@ -1369,41 +1350,6 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1): # loss -def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, - reduction='elementwise_mean'): - r"""The Connectionist Temporal Classification loss. - - See :class:`~torch.nn.CTCLoss` for details. - - Args: - log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, - `T = input length`, and `N = batch size`. - The logarithmized probabilities of the outputs - (e.g. obtained with :func:`torch.nn.functional.log_softmax`). - targets: :math:`(N, S)` or `(sum(target_lenghts))`. - Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. - input_lengths: :math:`(N)`. - Lengths of the inputs (must each be :math:`\leq T`) - target_lengths: :math:`(N)`. - Lengths of the targets - blank (int, optional): - Blank label. Default :math:`0`. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, - 'elementwise_mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken.
Default: 'elementwise_mean' - - Example:: - - >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() - >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) - >>> input_lengths = torch.full((16,), 50, dtype=torch.long) - >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) - >>> loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths) - >>> loss.backward() - """ - return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction)) - def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='elementwise_mean'): @@ -1725,7 +1671,7 @@ def _pointwise_loss(lambd, lambd_optimized, input, target, reduction='elementwis return d return torch.mean(d) if reduction == 'elementwise_mean' else torch.sum(d) else: - return lambd_optimized(input, target, _Reduction.get_enum(reduction)) + return lambd_optimized(input, target, reduction) def smooth_l1_loss(input, target, size_average=None, reduce=None, reduction='elementwise_mean'): @@ -1749,7 +1695,9 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='elementwis See :class:`~torch.nn.L1Loss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_string(size_average, reduce) + reduction = _Reduction.legacy_get_enum(size_average, reduce) + else: + reduction = _Reduction.get_enum(reduction) return _pointwise_loss(lambda a, b: torch.abs(a - b), torch._C._nn.l1_loss, input, target, reduction) @@ -1762,7 +1710,9 @@ def mse_loss(input, target, size_average=None, reduce=None, reduction='elementwi See :class:`~torch.nn.MSELoss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_string(size_average, reduce) + reduction = _Reduction.legacy_get_enum(size_average, reduce) + else: + reduction = _Reduction.get_enum(reduction) return _pointwise_loss(lambda a, b: (a - b) ** 2, torch._C._nn.mse_loss, input, target, reduction) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 6c66f8d43f005f..4d98f482768a63 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -3,10 +3,10 @@ from .conv import Conv1d, Conv2d, Conv3d, \ ConvTranspose1d, ConvTranspose2d, ConvTranspose3d from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \ - Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, Hardshrink, LeakyReLU, LogSigmoid, \ + Softmax, Softmax2d, LogSoftmax, ELU, SELU, Hardshrink, LeakyReLU, LogSigmoid, \ Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ - CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ + CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \ MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict @@ -31,10 +31,10 @@ __all__ = [ 'Module', 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'Threshold', 'ReLU', 'Hardtanh', 'ReLU6', - 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'CELU', 'GLU', 'Hardshrink', + 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'GLU', 'Hardshrink', 'LeakyReLU', 'LogSigmoid', 
'Softplus', 'Softshrink', 'PReLU', 'Softsign', 'Softmin', 'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', - 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', + 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 51cfab79404145..d372a2cae21d2c 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -118,7 +118,6 @@ class RReLU(Module): .. _`Empirical Evaluation of Rectified Activations in Convolutional Network`: https://arxiv.org/abs/1505.00853 """ - def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False): super(RReLU, self).__init__() self.lower = lower @@ -300,46 +299,6 @@ def extra_repr(self): return 'alpha={}{}'.format(self.alpha, inplace_str) -class CELU(Module): - r"""Applies element-wise, - :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))` - - More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . - - Args: - alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 - inplace: can optionally do the operation in-place. Default: ``False`` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional - dimensions - - Output: :math:`(N, *)`, same shape as the input - - .. image:: scripts/activation_images/CELU.png - - Examples:: - - >>> m = nn.CELU() - >>> input = torch.randn(2) - >>> output = m(input) - - .. _`Continuously Differentiable Exponential Linear Units`: - https://arxiv.org/abs/1704.07483 - """ - - def __init__(self, alpha=1., inplace=False): - super(CELU, self).__init__() - self.alpha = alpha - self.inplace = inplace - - def forward(self, input): - return F.celu(input, self.alpha, self.inplace) - - def extra_repr(self): - inplace_str = ', inplace' if self.inplace else '' - return 'alpha={}{}'.format(self.alpha, inplace_str) - - class SELU(Module): r"""Applies element-wise, :math:`\text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))`, @@ -709,7 +668,6 @@ class Softmin(Module): >>> input = torch.randn(2, 3) >>> output = m(input) """ - def __init__(self, dim=None): super(Softmin, self).__init__() self.dim = dim diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index ec7d60d8125152..489e8998843f98 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1123,61 +1123,6 @@ def forward(self, anchor, positive, negative): return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p, eps=self.eps, swap=self.swap, reduction=self.reduction) - -class CTCLoss(_Loss): - r"""The Connectionist Temporal Classification loss. - - Args: - blank (int, optional): blank label. Default :math:`0`. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, - 'elementwise_mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. 
Default: 'elementwise_mean' - - Inputs: - log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, - `T = input length`, and `N = batch size`. - The logarithmized probabilities of the outputs - (e.g. obtained with :func:`torch.nn.functional.log_softmax`). - targets: :math:`(N, S)` or `(sum(target_lenghts))`. - Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. - input_lengths: :math:`(N)`. - Lengths of the inputs (must each be :math:`\leq T`) - target_lengths: :math:`(N)`. - Lengths of the targets - - - Example:: - - >>> ctc_loss = nn.CTCLoss() - >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() - >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) - >>> input_lengths = torch.full((16,), 50, dtype=torch.long) - >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) - >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths) - >>> loss.backward() - - Reference: - A. Graves et al.: Connectionist Temporal Classification: - Labelling Unsegmented Sequence Data with Recurrent Neural Networks: - https://www.cs.toronto.edu/~graves/icml_2006.pdf - - .. Note:: - In order to use CuDNN, the following must be satisfied: :attr:`targets` must be - in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`, - :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of - :class:`torch.IntTensor`. - - The regular implementation uses the (more common in PyTorch) `torch.long` dtype. - """ - - def __init__(self, blank=0, reduction='elementwise_mean'): - super(CTCLoss, self).__init__(reduction=reduction) - self.blank = blank - - def forward(self, log_probs, targets, input_lengths, target_lengths): - return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction) - # TODO: L1HingeEmbeddingCriterion # TODO: MSECriterion weight # TODO: ClassSimplexCriterion diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 424670ac76fc14..c2b32cb97b6b01 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -242,7 +242,11 @@ def train(self, mode=True): module.train(mode) def _dist_broadcast_coalesced(self, tensors, buffer_size): - c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) + for tensors in _take_tensors(tensors, buffer_size): + flat_tensors = _flatten_dense_tensors(tensors) + c10d.broadcast(flat_tensors, 0, self.process_group).wait() + for tensor, synced in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)): + tensor.copy_(synced) def _sync_params(self): if len(self.device_ids) > 1: diff --git a/torch/nn/utils/convert_parameters.py b/torch/nn/utils/convert_parameters.py index 36a7eb207bcc65..7f0dd1666dad9c 100644 --- a/torch/nn/utils/convert_parameters.py +++ b/torch/nn/utils/convert_parameters.py @@ -45,9 +45,9 @@ def vector_to_parameters(vec, parameters): param_device = _check_param_device(param, param_device) # The length of the parameter - num_param = param.numel() + num_param = torch.prod(torch.LongTensor(list(param.size()))) # Slice the vector, reshape it, and replace the old data of the parameter - param.data = vec[pointer:pointer + num_param].view_as(param).data + param.data = vec[pointer:pointer + num_param].view(param.size()).data # Increment the pointer pointer += num_param diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3262ca282b2c5d..3ca44f35c4eff3 100644 --- 
a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -70,12 +70,6 @@ def _get_const(value, desc, arg_name): return _parse_arg(value, desc) -def _unpack_list(list_value): - list_node = list_value.node() - assert list_node.kind() == "prim::ListConstruct" - return list_node.inputs() - - def parse_args(*arg_descriptors): def decorator(fn): def wrapper(g, *args): @@ -221,18 +215,13 @@ def reciprocal(g, self): return g.op("Div", _if_scalar_type_as(g, torch.ones(1), self), self) -@parse_args('v', 'i') -def cat(g, tensor_list, dim): - tensors = _unpack_list(tensor_list) +# This syntax is Python 2 portable +def cat(g, *args): + dim = _get_const(args[-1], 'i', 'dim') + tensors = args[:-1] return g.op("Concat", *tensors, axis_i=dim) -@parse_args('v', 'i') -def stack(g, tensor_list, dim): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in _unpack_list(tensor_list)] - return g.op("Concat", *unsqueezed, axis_i=dim) - - def mm(g, self, other): # Create a dummy C tensor. Only needed for API purposes, the value is # since beta = 0 @@ -360,6 +349,11 @@ def view(g, self, size): return g.op("Reshape", self, shape) +def stack(g, *args): + unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in args[:-1]] + [args[-1]] + return concat(g, *unsqueezed) + + @parse_args('v', 'i', 'i') def split(g, self, split_size, dim): size = self.type().sizes()[dim] @@ -561,10 +555,9 @@ def replication_pad(g, input, padding): @parse_args('v', 'is') def upsample_nearest2d(g, input, output_size): - height_scale = float(output_size[-2]) / input.type().sizes()[-2] - width_scale = float(output_size[-1]) / input.type().sizes()[-1] return g.op("Upsample", input, - scales_f=[1., 1., height_scale, width_scale], + height_scale_f=float(output_size[-2]) / input.type().sizes()[-2], + width_scale_f=float(output_size[-1]) / input.type().sizes()[-1], mode_s="nearest") @@ -572,11 +565,10 @@ def upsample_nearest2d(g, input, output_size): def upsample_bilinear2d(g, input, output_size, align_corners): if align_corners: return _unimplemented("upsample_bilinear2d", "align_corners == True") - height_scale = float(output_size[-2]) / input.type().sizes()[-2] - width_scale = float(output_size[-1]) / input.type().sizes()[-1] - return g.op("Upsample", input, - scales_f=[1., 1., height_scale, width_scale], - mode_s="bilinear") + w_scale = float(output_size[-1]) / input.type().sizes()[-1] + h_scale = float(output_size[-2]) / input.type().sizes()[-2] + return g.op("Upsample", input, width_scale_f=w_scale, + height_scale_f=h_scale, mode_s="bilinear") def gt(g, input, other): @@ -667,12 +659,10 @@ def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) -@parse_args('v', 't', 't', 't') -def elu(g, input, alpha, scale, input_scale): +@parse_args('v', 't', 't') +def elu(g, input, alpha, scale): if scale and scale != 1.: return _unimplemented("scale", "does not support scale in Elu") - if input_scale and input_scale != 1.: - return _unimplemented("input_scale", "does not support input_scale in Elu") # See Note [Export inplace] return g.op("Elu", input, alpha_f=_scalar(alpha)) @@ -686,10 +676,8 @@ def index_select(g, self, dim, index): return g.op("Gather", self, index, axis_i=dim) -def index_put(g, self, indices_list_value, values): - indices_list = list(_unpack_list(indices_list_value)) - args = [self] + indices_list + [values] - return g.op("ATen", *args, operator_s='index_put') +def index_put(g, *inputs): + return g.op("ATen", *inputs, operator_s='index_put') def 
type_as(g, self, other): @@ -880,17 +868,14 @@ def topk(g, self, k, dim, largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) +@parse_args('v', 'is') def repeat(g, self, repeats): - if not _is_value(repeats): - repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) - const_repeats = _maybe_get_const(repeats, 'is') - - if self.isTensor() and not _is_value(const_repeats): + if self.isTensor(): sizes = self.type().sizes() - diff_dims = len(const_repeats) - len(sizes) + diff_dims = len(repeats) - len(sizes) if diff_dims > 0: self = view(g, self, [1] * diff_dims + sizes) - return g.op("Tile", self, repeats) + return g.op("Tile", self, g.op("Constant", value_t=torch.LongTensor(repeats))) def instance_norm(g, input, **kwargs): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index b770b900c4edd3..4f9299d258ea3e 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -480,14 +480,8 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor raise RuntimeError("Unsupported prim::Constant kind: `{}`. Send a bug report.".format( n.kindOf("value"))) elif op_name == "ListConstruct": - t = n.output().type() - # Tensor lists are used mostly for inputs to cat/stack. They need to be handled - # in those symbolics, and should become dead afterwards. - if t == torch._C.ListType.ofTensors(): - return None - elif t == torch._C.ListType.ofInts(): - unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] - return g.op("Concat", *unsqueezed, axis_i=0) + unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] + return g.op("Concat", *unsqueezed, axis_i=0) elif op_name == "Undefined": # Undefined is not an ONNX operator; keep it as prim::Undefined # and let the exporter handle finally eliminating these diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 96cfaff8684cf0..ad7f780719ccd3 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -1,4 +1,3 @@ -import types import math import torch from torch._six import inf @@ -87,37 +86,6 @@ def __init__(self, optimizer, lr_lambda, last_epoch=-1): self.last_epoch = last_epoch super(LambdaLR, self).__init__(optimizer, last_epoch) - def state_dict(self): - """Returns the state of the scheduler as a :class:`dict`. - - It contains an entry for every variable in self.__dict__ which - is not the optimizer. - The learning rate lambda functions will only be saved if they are callable objects - and not if they are functions or lambdas. - """ - state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', 'lr_lambdas')} - state_dict['lr_lambdas'] = [None] * len(self.lr_lambdas) - - for idx, fn in enumerate(self.lr_lambdas): - if not isinstance(fn, types.FunctionType): - state_dict['lr_lambdas'][idx] = fn.__dict__.copy() - - return state_dict - - def load_state_dict(self, state_dict): - """Loads the schedulers state. - - Arguments: - state_dict (dict): scheduler state. Should be an object returned - from a call to :meth:`state_dict`. 
- """ - lr_lambdas = state_dict.pop('lr_lambdas') - self.__dict__.update(state_dict) - - for idx, fn in enumerate(lr_lambdas): - if fn is not None: - self.lr_lambdas[idx].__dict__.update(fn) - def get_lr(self): return [base_lr * lmbda(self.last_epoch) for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] diff --git a/torch/tensor.py b/torch/tensor.py index 9784fd59c9d2fb..6b587fcf903586 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -384,8 +384,6 @@ def __dir__(self): return sorted(keys) # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` - __array_priority__ = 1000 # prefer Tensor ops over numpy ones - def __array__(self, dtype=None): if dtype is None: return self.cpu().numpy()