diff --git a/.clang-tidy b/.clang-tidy index 5466a4a31d20a3..d5fc66c26d42d9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,6 +2,7 @@ # NOTE: there must be no spaces before the '-', so put the comma first. Checks: ' * + ,clang-analyzer-* ,modernize-* ,-cert-err58-cpp ,-cert-err60-cpp @@ -9,6 +10,7 @@ Checks: ' ,-cppcoreguidelines-owning-memory ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay ,-cppcoreguidelines-pro-bounds-constant-array-index + ,-cppcoreguidelines-pro-type-member-init ,-cppcoreguidelines-pro-type-static-cast-downcast ,-cppcoreguidelines-pro-type-vararg ,-cppcoreguidelines-special-member-functions @@ -23,9 +25,11 @@ Checks: ' ,-hicpp-braces-around-statements ,-hicpp-explicit-conversions ,-hicpp-no-array-decay + ,-hicpp-signed-bitwise ,-hicpp-special-member-functions ,-hicpp-vararg ,-llvm-header-guard + ,-llvm-include-order ,-llvm-namespace-comment ,-misc-unused-parameters ,-modernize-make-unique @@ -34,7 +38,6 @@ Checks: ' ,-readability-braces-around-statements ,-readability-else-after-return ,-readability-named-parameter - ,clang-analyzer-* ' WarningsAsErrors: '' HeaderFilterRegex: 'torch/csrc/' diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 053a9be5e05487..76ecadbd5297b8 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -64,7 +64,13 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do ;; */aten/*) # ATen uses test framework Catch2 - "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" + # NB: We do NOT use the xml test reporter, because + # Catch doesn't support multiple reporters + # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 + # which means that enabling XML output means you lose useful stdout + # output for Jenkins. It's more important to have useful console + # output than it is to have XML output for Jenkins. + "$test" ;; *) "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 56db6914c1c20a..48e81dfd635bce 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -43,12 +43,9 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking export KMTHINLTO=1 - sudo chown -R jenkins:jenkins /usr/local - rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true - python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" - - USE_ROCM=1 python setup.py install - exit + python tools/amd_build/build_pytorch_amd.py + USE_ROCM=1 python setup.py install --user + exit 0 fi # TODO: Don't install this here diff --git a/CMakeLists.txt b/CMakeLists.txt index 651e230ab35ea7..51984e0b79c51d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -284,6 +284,8 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # in PROJECT_SOURCE_DIR. 
include_directories(BEFORE ${PROJECT_BINARY_DIR}) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) + # ---[ Old caffe protobuf if(BUILD_CAFFE2) add_subdirectory(caffe/proto) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 562910ad86a298..4eaa4ad81ec372 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -44,6 +44,7 @@ CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig # NB: If you edit these globs, you'll have to update setup.py package_data as well FILE(GLOB base_h "*.h" "detail/*.h") FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") +add_subdirectory(core) FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") @@ -62,7 +63,7 @@ FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") -set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -393,7 +394,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake -FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) +FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 309c4be2e651dd..7d3fdd1cc2d4af 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,6 +9,9 @@ #include "ATen/detail/CUDAHooksInterface.h" #include "ATen/CUDAStream.h" +// This is temporary +#include "ATen/core/ATenCoreTest.h" + #include #include #include diff --git a/aten/src/ATen/Storage.cpp b/aten/src/ATen/Storage.cpp index f5ba512cc27105..893703a0510425 100644 --- a/aten/src/ATen/Storage.cpp +++ b/aten/src/ATen/Storage.cpp @@ -4,20 +4,30 @@ namespace at { +Storage::Storage(at::ScalarType scalar_type, size_t size, Allocator* allocator) + : storage_impl_(new StorageImpl( + scalar_type, + size, + allocator, + /* resizable */ false)) {} + +Storage::Storage( + at::ScalarType scalar_type, + at::DataPtr data_ptr, + size_t size, + const std::function& deleter) + : storage_impl_(new StorageImpl( + scalar_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + /* resizable */ false)) {} + Storage::~Storage() { if (!storage_impl_) { return; } - if (--storage_impl_->refcount == 0) { - if (storage_impl_->finalizer) { - (*storage_impl_->finalizer)(); - } - storage_impl_->finalizer = nullptr; - storage_impl_->data_ptr.clear(); - if (storage_impl_ && --storage_impl_->weakcount == 0) { - delete storage_impl_; - } - } + storage_impl_->release(); } } // namespace at diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index a5c85192e36f8c..aa27296c74d40f 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -8,6 +8,12 @@ struct AT_API Storage { public: 
Storage() = delete; Storage(StorageImpl* storage_impl) : storage_impl_(storage_impl) {} + Storage(at::ScalarType, size_t size, Allocator* allocator); + Storage( + at::ScalarType, + at::DataPtr, + size_t size, + const std::function& deleter); ~Storage(); // There are reasonable interpretations of these constructors, but they're to // be implemented on demand. diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index a26f8971310aa5..6e3d693d012c5c 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,8 +12,6 @@ StorageImpl::StorageImpl( : scalar_type(scalar_type), data_ptr(std::move(data_ptr)), size(size), - refcount(1), - weakcount(1), // from the strong reference resizable(resizable), allocator(allocator), finalizer(nullptr) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index c48ec51e013d4c..d9f14ee6baa5f2 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,7 @@ namespace at { struct Type; -struct TH_CPP_API StorageImpl { +struct TH_CPP_API StorageImpl : public Retainable { StorageImpl() = delete; virtual ~StorageImpl() {}; @@ -48,8 +49,6 @@ struct TH_CPP_API StorageImpl { at::ScalarType scalar_type; at::DataPtr data_ptr; ptrdiff_t size; - std::atomic refcount; - std::atomic weakcount; bool resizable; at::Allocator* allocator; std::unique_ptr finalizer; @@ -76,6 +75,14 @@ struct TH_CPP_API StorageImpl { return static_cast(this->data_ptr.get()); } + void release_resources() { + if (finalizer) { + (*finalizer)(); + } + finalizer = nullptr; + data_ptr.clear(); + } + void operator=(const StorageImpl&) = delete; virtual size_t elementSize() const { @@ -94,9 +101,6 @@ struct TH_CPP_API StorageImpl { const void* data() const { return data_ptr.get(); }; - void retain() { - ++refcount; - } int getDevice() const { return data_ptr.device().index(); diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 55e7d3de6dea4a..8ebcfdaeada40f 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -64,7 +64,6 @@ class THLongStorageView { storage.size = ref.size(); } storage.scalar_type = at::CTypeToScalarType>::to(); - storage.refcount = 0; storage.set_resizable(false); } private: diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp new file mode 100644 index 00000000000000..e409ff2d979fce --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -0,0 +1,10 @@ +#include + +namespace at { + +static int CoreTestGlobal = 0; +int CoreTest() { + return CoreTestGlobal++; +} + +} diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h new file mode 100644 index 00000000000000..e79705fde394f5 --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -0,0 +1,18 @@ +#pragma once + +// TODO: Move this to something like ATenCoreGeneral.h +#ifdef _WIN32 +# if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +# define AT_CORE_API __declspec(dllexport) +# else +# define AT_CORE_API __declspec(dllimport) +# endif +#else +# define AT_CORE_API +#endif + +namespace at { + +AT_CORE_API int CoreTest(); + +} diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt new file mode 100644 index 00000000000000..66c04b3a17c592 --- /dev/null +++ b/aten/src/ATen/core/CMakeLists.txt @@ -0,0 +1,13 @@ +# This file solely exists to let Caffe2 Android build get at the list +# of core files without 
having to trundle through all of ATen's CMakeLists.txt + +FILE(GLOB ATen_CORE_HEADERS "*.h") +FILE(GLOB ATen_CORE_SRCS "*.cpp") + +# Pass to parent +set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE) +set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +# This is a little dodgy, because it means ALL ATen headers are made +# visible. Fortunately, you should just get a lot of undefined symbol +# errors if you go outside core +set(ATen_CORE_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../.. PARENT_SCOPE) diff --git a/aten/src/ATen/core/README.md b/aten/src/ATen/core/README.md new file mode 100644 index 00000000000000..71654f44e26f91 --- /dev/null +++ b/aten/src/ATen/core/README.md @@ -0,0 +1,5 @@ +ATen Core +--------- + +ATen Core is a minimal subset of ATen which is suitable for deployment +on mobile. Binary size of files in this folder is an important constraint. diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 085f2723bf0455..7ce3da3c9e051c 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -319,6 +319,20 @@ struct AT_CUDA_API RNNDescriptor } }; +#if CUDNN_VERSION >= 7000 + +struct AT_CUDA_API CTCLossDescriptor + : public Descriptor +{ + void set(cudnnDataType_t datatype) { + AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype)); + } +}; + +#endif + union Constant { float f; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 93c20d4be032f4..b012de25194361 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -290,7 +290,7 @@ def __init__(self, reason): 'Backend::${DenseBackend}, ScalarType::Long)'), 'THStorage*': CodeTemplate( - 'checked_cast_storage<${Storage}>(' + 'checked_cast_storage(' '&${arg_name},"${arg_name}",${arg_pos}, ' 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f2aaffd6eac9d..209cca57c293ff 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -103,10 +103,6 @@ def check_all_files_written(self): TEMPLATE_PATH = options.source_path + "/templates" GENERATOR_DERIVED = CodeTemplate.from_file( TEMPLATE_PATH + "/GeneratorDerived.h") -STORAGE_DERIVED_CPP = CodeTemplate.from_file( - TEMPLATE_PATH + "/StorageDerived.cpp") -STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") - TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") @@ -237,7 +233,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['isFloatingType'] = is_floating_type env['isIntegralType'] = not is_floating_type if density == 'Dense': - env['Storage'] = "{}{}Storage".format(backend, scalar_name) env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) @@ -246,7 +241,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['storage_tensor_headers'] = [] if density != 'Sparse': env['storage_tensor_headers'] = [ - '#include "ATen/{}.h"'.format(env['Storage']), '#include "ATen/{}.h"'.format(env['Tensor']), '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), '#include "ATen/{}IntTensor.h"'.format(env['Backend']), @@ -322,8 +316,6 @@ def 
generate_storage_type_and_tensor(backend, density, scalar_type, declarations if density != 'Sparse': # there are no storage or tensor types for sparse; it's all uniform - fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) - fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) @@ -379,7 +371,7 @@ def declare_outputs(): for backend, density, scalar_types in iterate_types(): scalar_name = scalar_types[0] full_backend = "Sparse" + backend if density == "Sparse" else backend - for kind in ["Storage", "Type", "Tensor"]: + for kind in ["Type", "Tensor"]: if kind != 'Type' and density == "Sparse": # No Storage or Tensor for sparse continue diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index cb24e71119f9b1..c82bf8ba0ae043 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtilsMulti.h" +#include namespace at { namespace native { @@ -136,6 +137,8 @@ Tensor einsum(std::string eqn, TensorList tensors) { } else { in_eqn = eqn; } + // remove spaces for einsum compatibility (#9929) + in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index int64_t operand = 0; @@ -212,7 +215,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_output_dims++; } } - } else { // letter (hopefully) + } else if (! isspace(c)) { // letter (hopefully) AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp new file mode 100644 index 00000000000000..092b7255eb4a0d --- /dev/null +++ b/aten/src/ATen/native/LossCTC.cpp @@ -0,0 +1,365 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the CPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) + +#include +#include "ATen/Dispatch.h" +#include "ATen/TensorUtils.h" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +template +static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// This kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). +// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// The function returns the loss and the alphas, the alphas are kept for the backward step. 
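+// Concretely, with l' denoting the augmented (blank-interleaved) target sequence, the
+// recursion of eq (6)/(7) is evaluated in log space as
+//   log_alpha_t(s) = logsumexp(log_alpha_{t-1}(s), log_alpha_{t-1}(s-1), [log_alpha_{t-1}(s-2)])
+//                    + log_probs_t(l'_s),
+// where the bracketed third term is only included when l'_s is not blank and differs from l'_{s-2}.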
The wrapper (ctc_loss below) hides +// the alphas from the user by only returning the loss. +template +std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + + CheckedFrom c = "ctc_loss_cpu"; + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + size_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto targets_data = targets.data(); + auto neg_log_likelihood_a = neg_log_likelihood.accessor(); + + // alpha calculation for the first row, the three equations for alpha_1 above eq (6) + // first the default + log_alpha.narrow(1, 0, 1).fill_(neginf); + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the first two items of alpha_t above eq (6) + log_alpha_a[0][0] = log_probs_a[0][BLANK]; + if (target_length > 0) + log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, 
tg_target_stride, 1, BLANK)]; + + // now the loop over the inputs + for (int64_t t=1; t 0) { + la2 = log_alpha_a[t-1][s-1]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_target_prime)) { + la3 = log_alpha_a[t-1][s-2]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // cannot do neginf-neginf + lamax = 0; + // this is the assignment of eq (6) + log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime]; + } + } + // the likelihood is the the sum of the last two alphas, eq (8), the loss is the negative log likelihood + scalar_t l1 = log_alpha_a[input_length-1][target_length*2]; + scalar_t l2 = log_alpha_a[input_length-1][target_length*2-1]; + scalar_t m = std::max(l1, l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_a[b] = -log_likelihood; + } + + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// This is the backward. It consists of two phases: +// a) computing the beta analogous to the alphas in the forward (backward half of the forward-backward algorithm) (eq (10) and (11)) +// b) collecting the per-activation characters for all s and wrapping the gradient (eq (16), the collection is the sum) +template +Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + int64_t max_input_length = log_probs.size(0); + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + Tensor grad = at::full_like(log_probs, neginf); // at this point, this is log of empty sum + + // The admin bits. We don't do much checking and assume that the forward did. 
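+  // The target-layout bookkeeping below mirrors the forward pass: for 1-d (concatenated)
+  // targets, tg_batch_offsets[i] is the running sum of target_lengths; for 2-d targets it
+  // is i * targets.stride(0) and max_target_length is taken from targets.size(1).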
+ int64_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + + Tensor log_beta = at::empty_like(log_alpha); // could be optimized to use only 2 rows + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto log_beta_a_global = log_beta.accessor(); + auto gp = grad.permute({1,0,2}); + auto grad_a_global = gp.accessor(); + auto targets_data = targets.data(); + + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + auto log_beta_a = log_beta_a_global[b]; + auto grad_a = grad_a_global[b]; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the initialization of beta before eq (10) + // here we do the fill for each batch item separately, as the input lengths will differ, so the t in which + // we start varies + if (input_length > 0) { + log_beta.narrow(0, b, 1).narrow(1, input_length-1, 1).fill_(neginf); + log_beta_a[input_length-1][2*target_length] = log_probs_a[input_length-1][BLANK]; + grad_a[input_length-1][BLANK] = log_alpha_a[input_length-1][2*target_length] + log_beta_a[input_length-1][2*target_length]; + + if (target_length > 0) { + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 2*target_length-1, BLANK); + log_beta_a[input_length-1][2*target_length-1] = log_probs_a[input_length-1][current_target_prime]; + + // the first two are a blank and a non-blank, so we know they are different and we don't need to do log+ + grad_a[input_length-1][current_target_prime] = log_alpha_a[input_length-1][2*target_length-1] + log_beta_a[input_length-1][2*target_length-1]; + } + } + + // now loop applying eq (10) / (11) + for (int64_t t=input_length-2; t>=0; t--) { + // this loop over s could be parallel/vectorized and doesn't really need to be descending... + // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending) + // for the cuda implementation, that gave a speed boost. 
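+      // The recursion implemented below is the log-space analogue of eq (10)/(11):
+      //   log_beta_t(s) = logsumexp(log_beta_{t+1}(s), log_beta_{t+1}(s+1), [log_beta_{t+1}(s+2)])
+      //                   + log_probs_t(l'_s),
+      // with the bracketed term only present when l'_{s+2} exists and differs from l'_s.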
+ for (int64_t s=2*target_length; s>=0; s--) { + scalar_t lb1 = log_beta_a[t+1][s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + if (s < 2*target_length) { + lb2 = log_beta_a[t+1][s+1]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if ((s < 2*target_length-1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)) { + lb3 = log_beta_a[t+1][s+2]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime]; + // one might check whether one can vectorize this better when done after the t-loop... + // now that we have beta, we fill in the sum of alpha*beta in eq (16) + // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency + // issue (several s can map to the same target character) + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s]; + scalar_t &lcab = grad_a[t][current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = std::max(lcab, log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + // now grad has the sum of eq (16) + // now we wrap up the calculation by adding in the remaining items of eq (16) + // this could be a great target for further vectorization. + // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) + scalar_t nll = neg_log_likelihood.accessor()[b]; + scalar_t gr = grad_out.accessor()[b]; + for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? 
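+      // At this point grad_a[t][c] holds log(sum over s with target'[s] == c of alpha_t(s)*beta_t(s)),
+      // or neginf if no s maps to c. The inner loop below converts each entry, following eq (16),
+      // into exp(lp) - exp(grad + nll - lp) * gr.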
+ for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = grad_a[t][c]; + scalar_t lp = log_probs_a[t][c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + } + // zero the remainder + if (input_length < max_input_length) { + grad.narrow(0, input_length, max_input_length - input_length).narrow(1, b, 1).zero_(); + } + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +// this wrapper function dispatches to the native and cudnn implementations and hides the alpha/grad from the user (by just returning the loss) +// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients +// it also handles the reduction if desired +Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, int64_t reduction) { + auto& ctx = at::globalContext(); + + bool use_cudnn = + detail::getCUDAHooks().compiledWithCuDNN() && + (detail::getCUDAHooks().versionCuDNN() >= 7000) && + ctx.userEnabledCuDNN() && + (BLANK == 0) && (targets.dim()==1) && + (log_probs.type().scalarType() == at::kFloat) && + (targets.type().scalarType() == at::kInt) && + (log_probs.type().backend() == Backend::CUDA); + + if (use_cudnn) { + // we don't know that input_lengths and target_lengths have the same size (they should, but we didn't check yet) + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < input_lengths.size(); b++) { + use_cudnn &= (input_lengths[b] == max_input_length); + } + for (int64_t b = 0; b < target_lengths.size(); b++) { + use_cudnn &= (target_lengths[b] <= 256); + } + } + + Tensor res; + if (use_cudnn) { + res = std::get<0>(at::_cudnn_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK, ctx.deterministicCuDNN())); + } else { + res = std::get<0>(at::_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK)); + } + if (reduction == Reduction::ElementwiseMean) { + auto target_lengths_t = at::tensor(target_lengths, res.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(res.type()); + return (res / target_lengths_t).mean(); + } else if (reduction == Reduction::Sum) { + return res.sum(); + } + return res; +} + +} } // at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 4b474e0c079e77..12f22fcaf2f216 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ 
b/aten/src/ATen/native/cuda/Loops.cuh @@ -76,6 +76,9 @@ void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { using arg0_t = typename traits::result_type; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_trivial_1d()) { auto strides = iter.get_inner_strides(); int stride0 = strides[0]; @@ -105,6 +108,9 @@ void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { using arg1_t = typename traits::arg1_t; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); @@ -152,6 +158,9 @@ void gpu_binary_kernel(TensorIterator& iter, const func_t& f) { using arg2_t = typename traits::arg2_t; int numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu new file mode 100644 index 00000000000000..70ece3f4440cf7 --- /dev/null +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -0,0 +1,625 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the GPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) +// A few optimizations (simmilar to those here, but also some I didn't take) are described in +// 2. Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf + +#include +#include + +#include +#include "ATen/Dispatch.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +// __restrict__ impact to be measured, https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ +template +__device__ static inline int64_t get_target_prime(const target_t* __restrict__ target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// this kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). +// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// In total it would be more efficient to compute the beta in the same kernel (e.g. cudnn does this). While the beta are not +// needed for the loss itself (just the grad), we can return log_alpha+log_beta (so same space as currently) and the overhead +// is small and the use-case for loss without grad is relatively limited. +// We parallelize by batch and target sequence. Empirically, it is faster to loop over the input (log probs) sequence and do +// target in parallel, even if it means more frequent __syncthreads. +// In contrast to the cuDNN implementation, we allow large target lengths. For this we need that all previous `s` have been +// computed when we start a new block_s. This is why we have our own for loop here. 
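+// Thread layout: threadIdx.y/blockIdx.y select the batch item b; threadIdx.x indexes the
+// augmented target position s within a chunk of blockDim.x, and the kernel itself loops over
+// those chunks (block_s) rather than distributing them over blockIdx.x, so that all s of the
+// previous chunk have been written. The loop over t is sequential and separated by
+// __syncthreads(), because log_alpha[t] is computed from log_alpha[t-1] at s, s-1, s-2.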
+template +__global__ void ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + scalar_t* __restrict__ neg_log_likelihood_data, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + + // bookkeeping + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // first row (t=0), the three equations for alpha_1 above eq (6) + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t la; + switch (s) { + case 0: + la = log_probs_data[lp_batch_offset + lp_char_stride * BLANK]; + break; + case 1: + if (target_length > 0) { + la = log_probs_data[lp_batch_offset + lp_char_stride * get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; + } + else { + la = neginf; + } + break; + default: + la = neginf; + } + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + /* la_input_stride * 0 */ + la_target_stride * s] = la; + } + + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + + // These two only depend on s, so we can cache them. + int64_t current_char; // l_s in eq (6) + bool have_three; // flag which of the two cases in eq (6) we have + if (s < 2*target_length+1) { + current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_char)); + } else { + current_char = BLANK; + have_three = false; + } + for (int64_t t=1; t < max_input_length; t++) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) { + // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands, + // lamax is the maximum for the logsumexp trick. + scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s]; + scalar_t lamax = la1; + scalar_t la2, la3; + if (s > 0) { + la2 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-1)]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if (have_three) { + la3 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-2)]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // when all are neginf. 
(then the whole thing is neginf, but we can pretend) + lamax = 0; + + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char]; + } else { + // otherwise we just set to neginf + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf; + } + } + } + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + + // compute the loss (eq (8)) + if (threadIdx.x == 0) { + scalar_t l1 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2)]; + scalar_t l2 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2-1)]; + scalar_t m = ((l1 > l2) ? l1 : l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_data[b] = -log_likelihood; + } +} + +// The forward computation. Lot's of admin and a call to the alpha kernel. +// Note: we do not check that the labels are in the valid range. As we use +// them for indexing in the kernels, you'll see memory errors when you +// pass corrupt labels. +// We support both a 2-dimensional tensor as targets (one set of targets in each row) and +// a 1-dimensional tensor where all targets are concatenated (and we use target_lengths +// to figure out where they begin). +// We return log_alpha (currently, might change to (log_alpha+log_beta) to be passed to the +// backward. The dispatch function will only return the loss. +template +std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + CheckedFrom c = "ctc_loss_gpu"; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to log_probs cuda if it isn't there already + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkAllSameGPU(c, {log_probs_arg, targets_arg}); + + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // 
batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... + constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + ctc_loss_log_alpha_gpu_kernel<<>>( + log_alpha.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// The second (backward) half of the forward backward algorithm, (10) and (11). This is parallel to the +// alpha kernel above. (As mentioned above, it might make sense do the calculation in the alpha kernel.) 
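+// The thread layout mirrors the alpha kernel: x covers the augmented target positions s in
+// chunks of blockDim.x, y covers the batch, and t runs backwards with a __syncthreads()
+// per step because log_beta[t] is computed from log_beta[t+1] at s, s+1, s+2.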
+template +__global__ void ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t lb; + if (s == 2*target_length) { + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * BLANK]; + } else if ((target_length > 0) && (s == 2*target_length-1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * current_target_prime]; + } else { + lb = neginf; + } + if (s < 2*max_target_length+1) { + log_beta_data[lb_batch_offset + (input_length-1) * lb_input_stride + lb_target_stride * s] = lb; + } + } + + // go backward in s + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + int64_t current_target_prime; + bool have_three; + if (s < 2*target_length+1) { + current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s < 2*target_length-1) && + (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)); + } else { + current_target_prime = BLANK; + have_three = false; + } + // now go backward in t. Note that we need to skip the last timestep that we did above. 
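+      // For (t, s) combinations outside this batch item's valid range, the else branch below
+      // just writes neginf, so entries belonging to shorter sequences stay well defined.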
+ for (int64_t t=max_input_length-2; t>=0; t--) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch item + if ((t < input_length-1) && (target_length > 0) && (s < 2*target_length+1)) { + scalar_t lb1 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + + if (s < 2*target_length) { + lb2 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+1)]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if (have_three) { + lb3 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+2)]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + scalar_t lb = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; + } else if ((s < 2*max_target_length+1) || (t >= input_length)) { + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; + } + } + } +} + +// This implements the subtrahend of equation (16) for all *nonblank* characters. +// It assumes you have probs in gradient_data when called +// and it modifies gradient_data to be, the gradient. +// In order to facilitate this inplace update, We don't actually do this in logspace. +// (The other variant implemented uses log_space and the differences seem to be +// not so problematic at least with unit normal distributed test activations.) +// Internally this uses atomicAdd because different threads may write to the same +// gradient position. +// This is parallelised over b and s again. +// Note that for us, the Z of eqn (16) is actually constant for all t and it is the +// likelihood - this is why we use the negative log likelihood below. +// We also multiply by the input gradient to keep with standard autograd style. +// I took this trick from [2], for moderate alphabet sizes a log-space +// calculation (with an atomic log add) is similarly in performance, but for large +// alphabets the inplace nature is a considerable advantage. +template +__global__ void ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime! 
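+  // This kernel is parallel over b and the *original* target index s; the corresponding
+  // non-blank position in the augmented sequence l' is 2*s+1, which is why the alpha/beta
+  // reads below use la/lb_target_stride * (s*2+1).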
+ + if (b >= batch_size) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (s >= target_length) + return; + + int64_t target = targets_data[tg_batch_offset + s * tg_target_stride]; + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t t = 0; t < input_length; t++) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target]; + atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target], + -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)] + + nll - lp) * gr); + } +} + +// This is the naive implementation of equation (16). It is parallelised in batch and input timestep. +// It appears to be faster than the above method for small batch sizes. +template +__global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t t = threadIdx.x + blockIdx.x * blockDim.x; + + if ((t >= max_input_length) || (b >= batch_size)) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + for (int s = 0; s < 2*max_target_length+1; s++) { + if ((target_length > 0) && (s < 2*target_length+1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + scalar_t log_alpha_beta = (log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s]); + scalar_t& lcab = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = ((lcab > log_alpha_beta) ? 
lcab : log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; + if (t < input_length) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + else { + res = 0.; + } + } +} + +// The backward. It essentially computes eq 16 by using the above kernels. +// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward. +template +Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to cuda if it isn't there already + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_beta = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) + + // As above, there may be better configurations to use. 
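+  // threads_target ends up as the smallest power of two that still covers the
+  // 2*max_target_length+1 augmented target positions (capped at max_threads); the remaining
+  // threads of the block are assigned to batch items.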
+ constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + { + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_log_beta_gpu_kernel<<>> + (log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + } + + // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on + // the (capped) difference of timings. + // Note that for OK problems target length <= input length, so we + // only consider input length. + bool is_large = (2*log_probs.size(0)+(24*batch_size)/10+(2*num_labels)/10) > 450; + if (is_large) { // large alphabet, large batch + // this computes the probs, minuend in (16) + exp_out(grad, log_probs); + // now we compute the subtrahend for the blanks. It is a straightforward reduction because we know that + // blanks are in every other position. + // maybe we should kernelize this, too. + auto grad_blank = grad.narrow(2, BLANK, 1); + grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1}, + {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2}) + + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1}, + {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}), + 2, true) + .permute({1, 0, 2}) + .add_(neg_log_likelihood.view({1, batch_size, 1})) + .sub_(log_probs.narrow(2, BLANK, 1)) + .exp_() + ); + // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // Again we might configure block and grid in a better way. + int threads_target = max_threads; + while (threads_target / 2 >= max_target_length) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + dim3 block(threads_target, threads_batch); + dim3 grid((max_target_length + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + ctc_loss_backward_collect_nonblank_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } else { // small problem, use naive algorithm + // Still no block/grid configuration guru... 
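+    // The naive collect kernel is parallel over (b, t) and loops over s and the label
+    // alphabet internally, so the launch below is configured over the input length and the
+    // batch rather than over the target length.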
+ int threads_input = max_threads; + while (threads_input / 2 >= log_probs.size(0)) { + threads_input /= 2; + } + threads_batch = std::min(max_threads / threads_input, (int) batch_size); + dim3 block(threads_input, threads_batch); + dim3 grid((log_probs.size(0) + threads_input-1)/threads_input, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_collect_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +} } // at::native diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp new file mode 100644 index 00000000000000..966aa20e0a128d --- /dev/null +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#if AT_CUDNN_ENABLED() + #include +#endif + + +#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { + throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +} // namespace + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntList input_lengths_, IntList target_lengths_, int64_t BLANK, bool deterministic) { + CheckedFrom c = "cudnn_ctc_loss"; + TensorArg log_probs { log_probs_t, "log_probs", 1 }; + TensorArg targets { targets_t, "targets", 2 }; + checkDim(c, log_probs, 3); + checkScalarType(c, log_probs, kFloat); + checkDim(c, targets, 1); + checkScalarType(c, targets, kInt); + checkContiguous(c, targets); // ? 
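Taken together, the argument checks in this function and the conditions that the dispatch-time comment below says are asserted elsewhere (blank label 0, all label lengths at most 256, every input length equal to log_probs.size(0)) describe when the cuDNN kernel is usable at all. A hedged sketch of that eligibility test gathered into one predicate; the real dispatching logic is outside this excerpt and the names here are illustrative:

```cpp
#include <cstdint>
#include <vector>

// Illustrative only: the eligibility conditions for the cuDNN CTC kernel, as
// implied by the checks in _cudnn_ctc_loss and the comment about what the
// dispatching code asserts. This is not the actual PyTorch dispatch logic.
bool cudnn_ctc_loss_usable(int64_t blank,
                           int64_t time_steps,  // log_probs.size(0)
                           const std::vector<int64_t>& input_lengths,
                           const std::vector<int64_t>& target_lengths) {
  if (blank != 0) {
    return false;  // cuDNN requires the blank label to be 0
  }
  for (int64_t il : input_lengths) {
    if (il != time_steps) {
      return false;  // every sequence must use the full number of time steps
    }
  }
  for (int64_t tl : target_lengths) {
    if (tl > 256) {
      return false;  // cuDNN's label-length limit
    }
  }
  // In addition: float log_probs on CUDA, int32 1-d (concatenated) targets on CPU.
  return true;
}
```

Inputs that fail any of these conditions would presumably be routed to the native CUDA implementation shown earlier in this diff.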
+ checkBackend(c, {*log_probs}, Backend::CUDA); + checkBackend(c, {*targets}, Backend::CPU); + int64_t batch_size = log_probs->size(1); + AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); + AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); + + std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); + std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); + + setCuDNNStreamToCurrent(); + AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); + // checked in dispatch: + // assert other conditions for cudnnCTCLoss: all label lengths <= 256 + // all input lengths = logprob.size(0) + + auto handle = getCudnnHandle(); + + cudnnCTCLossAlgo_t algo = (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); + + Tensor probs = log_probs->softmax(2); + TensorDescriptor probs_desc{probs}; + Tensor grad = at::empty_like(probs); + TensorDescriptor grad_desc{grad}; + + CTCLossDescriptor ctc_loss_desc; + ctc_loss_desc.set(CUDNN_DATA_FLOAT); + + size_t workspace_size; + AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(), + targets->data(), target_lengths.data(), input_lengths.data(), + algo, ctc_loss_desc.desc(), &workspace_size)); + + + Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? + Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); + + AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), + targets->data(), target_lengths.data(), input_lengths.data(), + costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo, + ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size)); + + return std::make_tuple(costs, grad); +} + + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8692d6165ff72a..6917cdb6ee6a65 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -29,6 +29,11 @@ - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor variants: function, method +- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _cudnn_ctc_loss + - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function dispatch: @@ -504,6 +509,21 @@ - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor variants: function +- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: ctc_loss_cpu + CUDA: ctc_loss_gpu + +- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor + variants: function + dispatch: + CPU: ctc_loss_backward_cpu + CUDA: ctc_loss_backward_gpu + - func: det(Tensor self) -> 
Tensor - func: diagflat(Tensor self, int64_t offset=0) -> Tensor diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp deleted file mode 100644 index 0491203c3286e6..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "ATen/${Storage}.h" - -// ${generated_comment} - -#include "ATen/Half.h" -#include "ATen/Allocator.h" -#include - -#include "ATen/Config.h" -$extra_cuda_headers - -namespace at { - -${Storage}::${Storage}() - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - 0, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size, Allocator* allocator) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - allocator, - /* resizable */ false)) {} - -// TODO: Take in Device as an input to the std::function constructor - -#if ${isCUDA} -static int getPointerDevice(void* ptr) { - struct cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); - return attr.device; -} -#endif - -${Storage}::${Storage}( - void * data, - size_t size, - const std::function & deleter) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - InefficientStdFunctionContext::makeDataPtr(data, deleter, -#if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) -#else - kCPU -#endif - ), - /* allocator */ nullptr, - /* resizable */ false)) {} -} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h deleted file mode 100644 index dddcd5dbf03f21..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -// ${generated_comment} - -$th_headers - -#include "ATen/Storage.h" -#include "ATen/Context.h" - -#include - -namespace at { - -struct Allocator; - -struct ${Storage} final : public Storage { - ${Storage}(); - ${Storage}(StorageImpl* storage_impl) : Storage(storage_impl){}; - ${Storage}(size_t size); - ${Storage}(size_t size, Allocator* allocator); - ${Storage}( - void* data, - size_t size, - const std::function& deleter); - StorageImpl* storage_impl_; - - protected: - friend struct ${Type}; -}; - -} // namespace at diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index cc2f47a89180ab..aeba9fb22a3653 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -3,5 +3,5 @@ std::unique_ptr ${Tensor}::storage() { auto storage = THTensor_getStoragePtr(tensor); THStorage_retain(storage); - return std::unique_ptr(new ${Storage}(storage)); + return std::unique_ptr(new Storage(storage)); } diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index d72ba4abde2c12..249fce467debfc 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -7,7 +7,7 @@ #include "ATen/Config.h" #include "ATen/${Tensor}.h" -#include "ATen/${Storage}.h" +#include "ATen/Storage.h" #include "ATen/Scalar.h" #include "ATen/Half.h" diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 67009473dddefc..ddd1483f0436f3 100644 --- 
a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -31,6 +31,14 @@ namespace at { +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + ${Type}::${Type}(Context* context) : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { @@ -44,18 +52,44 @@ bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage() const { - return std::unique_ptr(new ${Storage}()); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + 0, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storage(size_t size) const { - return std::unique_ptr(new ${Storage}(size)); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + size, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return std::unique_ptr( - new ${Storage}(data,size,deleter)); + new Storage( + ScalarType::${ScalarName}, + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), + size, + deleter)); } std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { return std::unique_ptr( - new ${Storage}(size, allocator)); + new Storage(ScalarType::${ScalarName}, size, allocator)); } Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) @@ -65,7 +99,7 @@ Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain) ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); - return std::unique_ptr(new ${Storage}((${THStorage}*) th_pointer)); + return std::unique_ptr(new Storage((${THStorage}*) th_pointer)); } std::unique_ptr ${Type}::generator() const { return std::unique_ptr(new ${Generator}(context)); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 6b46c8c0b70018..8e58df97073086 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -270,6 +270,10 @@ static void test(Type & type) { auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } + SECTION("core") { + int i = CoreTest(); + REQUIRE(i + 1 == CoreTest()); + } } TEST_CASE( "basic tests CPU", "[cpu]" ) { diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index 0f05bb466651d3..0c36d5bf97fcf0 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -19,38 +19,25 @@ void THStorage_free(THStorage* storage) { if (!storage) { return; } - - if (--storage->refcount == 0) { - if (storage->finalizer) { - (*storage->finalizer)(); - } - storage->finalizer = nullptr; - storage->data_ptr.clear(); - THStorage_weakFree(storage); - } + storage->release(); } // Manually retains a weak reference void THStorage_weakRetain(THStorage *weak_storage) { - weak_storage->weakcount++; + weak_storage->weak_retain(); } // Releases a weak reference void 
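The edits in Storage.cpp above and in this file replace hand-rolled reference-count bookkeeping with retain(), release(), weak_retain(), weak_release(), and weak_lock() calls on the storage object; the deleted bodies show the intended semantics. A schematic of that strong/weak protocol, written as an assumption about the helper these methods forward to rather than code taken from this diff:

```cpp
#include <atomic>

// Schematic of the strong/weak reference protocol implied by the code removed
// above; an assumption about the helper that retain()/release()/weak_lock()
// forward to, not code from this diff.
struct RefcountedSketch {
  std::atomic<int> refcount{1};   // strong references
  std::atomic<int> weakcount{1};  // weak references, plus 1 while strong refs exist

  void retain() { ++refcount; }

  void release() {
    if (--refcount == 0) {
      // run the finalizer / clear the data pointer here, then drop the
      // implicit weak reference held by the strong side
      weak_release();
    }
  }

  void weak_retain() { ++weakcount; }

  void weak_release() {
    if (--weakcount == 0) {
      delete this;
    }
  }

  // Upgrade a weak reference: succeeds only while the strong count is nonzero.
  bool weak_lock() {
    int count = refcount.load();
    do {
      if (count == 0) {
        return false;
      }
    } while (!refcount.compare_exchange_weak(count, count + 1));
    return true;
  }
};
```

The invariant visible in the removed code is that the strong side owns one implicit weak reference, so the object is deleted only after both counts reach zero, and weak_lock() can only succeed while at least one strong reference is still alive.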
THStorage_weakFree(THStorage *weak_storage) { - if (--weak_storage->weakcount == 0) { - delete weak_storage; - } + weak_storage->weak_release(); } // Given a weak reference, returns a strong reference to a storage (which must // be freed when done) or null if the storage is already dead. THStorage* THStorage_weakLock(THStorage *weak_storage) { - for (;;) { - int refcount = weak_storage->refcount.load(); - if (refcount == 0) return nullptr; - if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; - } - return weak_storage; + if (weak_storage->weak_lock()) + return weak_storage; + return nullptr; } THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { @@ -95,7 +82,7 @@ ptrdiff_t THStorage_size(const THStorage *self) void THStorage_retain(THStorage *storage) { if (storage) { - ++storage->refcount; + storage->retain(); } } diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 671e2f39fb1c7e..0e8b3e4ab17bee 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -35,8 +35,6 @@ TH_API ptrdiff_t THStorage_size(const THStorage *self); -TH_API void THStorage_setFlag(THStorage *storage, const char flag); -TH_API void THStorage_clearFlag(THStorage *storage, const char flag); TH_API void THStorage_retain(THStorage *storage); TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 16329f7ed7f621..364290934263ef 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -109,6 +109,17 @@ inline int64_t* THTensor_getStridePtr(THTensor* tensor) { // NB: Non-retaining inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { + // Within PyTorch, the invariant is that storage_ is always + // initialized; we never have tensors that don't have any storage. + // However, for Caffe2, this is not true, because they have permitted + // tensors to be allocated without specifying what scalar type + // they should be, only to be filled when GetMutableData is called + // for the first time (providing the necessary type). It is an ERROR to + // invoke any PyTorch operations on such a half-constructed storage, + // and this check tests for that case. + AT_CHECK(tensor->storage_, "Cannot use PyTorch operations on a half-constructed " + "tensor. If this tensor came from Caffe2, please call GetMutableData on " + "it first; otherwise, this is a bug, please report it."); return tensor->storage_; } @@ -141,6 +152,9 @@ inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset // NB: Steals ownership of storage inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + AT_ASSERT(storage); tensor->storage_ = storage; } @@ -177,6 +191,19 @@ inline int THTensor_nDimensionLegacyAll(const THTensor* tensor) { } } +inline int64_t THTensor_strideLegacyNoScalars(const THTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 
1 : self->stride(dim); +} + +inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 1 : self->size(dim); +} + TH_API void THTensor_free(THTensor *self); TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 8272b3d4020ec7..65bd6cdec850bb 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -18,7 +18,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -52,7 +52,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -149,7 +149,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -187,7 +187,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index f73bd5835c04bb..0363dcf0e3996a 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(1) == 3; + return !t->is_empty() && t->dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(0) == size0 && t->size(1) == size1; + return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 0699c3ac471c55..a18252b06914d6 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,14 +17,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); } else @@ -155,16 +155,16 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), "inconsistent 
isTarget size"); } else diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 424669e5de8515..511089bfaeb5b6 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,13 +20,13 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { @@ -136,19 +136,19 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && (input->dim() <= 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index a28d4e78477ceb..3bf8e652fa9ed9 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -26,7 +26,7 @@ static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index 4b74445e047705..6b1480667d3167 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -189,7 +189,7 @@ void THNN_(VolumetricConvolution_accGradParameters)( int nOutputPlane = (int)gradWeight->size(0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && THTensor_nDimensionLegacyNoScalars(gradBias) == 1 && THTensor_sizeLegacyNoScalars(gradBias, 0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 0d84ccbfb606a1..113b4e8ba2e2c8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -51,6 +51,14 @@ if(BUILD_ATEN) set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) +else() + # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen. + # This codepath should only be exercised by the Android build. + add_subdirectory(../aten/src/ATen/core ATen_core) + list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) + list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) + # TODO: We should probably install the headers, but I don't know + # how to do that. 
endif() # ---[ Torch build diff --git a/caffe2/core/context.h b/caffe2/core/context.h index f2831909e1587a..8bf5b9dc61dc7a 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,6 +13,8 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2.pb.h" +#include "ATen/core/ATenCoreTest.h" + CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); namespace caffe2 { diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index a6e44846e9e0be..8924a9dc931be9 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -6,6 +6,11 @@ namespace caffe2 { +TEST(CPUContextTest, ATenCoreTest) { + int i = at::CoreTest(); + EXPECT_EQ(i + 1, at::CoreTest()); +} + TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 3c5148e5b6c70f..aab127d8c56e16 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -46,28 +46,31 @@ class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) - : StorageType(std::forward(args)...), Tail(tail), Head(head) { + : StorageType(std::forward(args)...), + tail_(tail), + head_(head) { DEBUG_PRINT("Creating instance of Edge: %p\n", this); } const NodeRef& tail() const { - return Tail; + return tail_; } const NodeRef& head() const { - return Head; + return head_; } void setTail(NodeRef n) { - Tail = n; + tail_ = n; } void setHead(NodeRef n) { - Head = n; + head_ = n; } private: - NodeRef Tail; - NodeRef Head; + NodeRef tail_; + NodeRef head_; + friend class Graph; }; @@ -88,54 +91,55 @@ class Node : public StorageType, public Notifier> { /// \brief Adds an edge by reference to known in-edges. /// \p e A reference to an edge that will be added as an in-edge. void addInEdge(EdgeRef e) { - inEdges.emplace_back(e); + inEdges_.emplace_back(e); } /// \brief Adds an edge by reference to known out-edges. /// \p e A reference to an edge that will be added as an out-edge. void addOutEdge(EdgeRef e) { - outEdges.emplace_back(e); + outEdges_.emplace_back(e); } /// \brief Removes an edge by reference to known in-edges. /// \p e A reference to an edge that will be removed from in-edges. void removeInEdge(EdgeRef e) { - auto iter = std::find(inEdges.begin(), inEdges.end(), e); - assert( - iter != inEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - inEdges.erase(iter); + removeEdgeInternal(inEdges_, e); } /// \brief Removes an edge by reference to known out-edges. /// \p e A reference to an edge that will be removed from out-edges. 
void removeOutEdge(EdgeRef e) { - auto iter = std::find(outEdges.begin(), outEdges.end(), e); - assert( - iter != outEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - outEdges.erase(iter); + removeEdgeInternal(outEdges_, e); } const std::vector& getOutEdges() const { - return outEdges; + return outEdges_; } const std::vector& getInEdges() const { - return inEdges; + return inEdges_; } - void setInEdges(std::vector es) { - inEdges = es; + void setInEdges(std::vector edges) { + inEdges_ = edges; } - void setOutEdges(std::vector es) { - outEdges = es; + void setOutEdges(std::vector edges) { + outEdges_ = edges; } - protected: - std::vector inEdges; - std::vector outEdges; + private: + std::vector inEdges_; + std::vector outEdges_; + friend class Graph; + + void removeEdgeInternal(std::vector& edges, EdgeRef e) { + auto iter = std::find(edges.begin(), edges.end(), e); + assert( + iter != edges.end() && + "Attempted to remove edge that isn't connected to this node"); + edges.erase(iter); + } }; /// \brief Effectively a constant reference to a graph. @@ -158,46 +162,56 @@ class Subgraph { using EdgeRef = typename Graph::EdgeRef; void addNode(NodeRef n) { - Nodes.insert(n); + nodes_.insert(n); } + bool hasNode(NodeRef n) const { - return Nodes.count(n) != 0; + return nodes_.count(n) != 0; } + void removeNode(NodeRef n) { - Nodes.erase(n); + nodes_.erase(n); } void addEdge(EdgeRef e) { - Edges.insert(e); + edges_.insert(e); } - bool hasEdge(EdgeRef n) const { - return Edges.count(n) != 0; + + bool hasEdge(EdgeRef e) const { + return edges_.count(e) != 0; } + void removeEdge(EdgeRef e) { - Edges.erase(e); + edges_.erase(e); } const std::unordered_set& getNodes() const { - return Nodes; + return nodes_; + } + + const size_t getNodesCount() const { + return (size_t)nodes_.size(); } + const std::unordered_set& getEdges() const { - return Edges; + return edges_; } + private: + std::unordered_set nodes_; + std::unordered_set edges_; + void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge->tail(), edge->head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", node); } } - - std::unordered_set Nodes; - std::unordered_set Edges; }; /// \brief A simple graph implementation @@ -231,21 +245,21 @@ class Graph { } void importNode(NodeRef node, Graph& otherGraph) { - for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { if (&(*it) == node) { - std::list>& otherNodes = otherGraph.Nodes; - otherNodes.splice(otherNodes.end(), Nodes, it, ++it); - otherGraph.NodeRefs.insert(node); + std::list>& otherNodes = otherGraph.nodes_; + otherNodes.splice(otherNodes.end(), nodes_, it, ++it); + otherGraph.nodeRefs_.insert(node); break; } } } void importEdge(EdgeRef edge, Graph& otherGraph) { - std::list>& otherEdges = otherGraph.Edges; - for (auto it = Edges.begin(); it != Edges.end(); ++it) { + std::list>& otherEdges = otherGraph.edges_; + for (auto it = edges_.begin(); it != edges_.end(); ++it) { if (&(*it) == edge) { - otherEdges.splice(otherEdges.end(), Edges, it, ++it); + otherEdges.splice(otherEdges.end(), edges_, it, ++it); break; } } @@ -313,9 +327,9 @@ class Graph { /// \return A reference to the edge created. EdgeRef createEdge(NodeRef tail, NodeRef head, U... 
data) { DEBUG_PRINT("Creating edge (%p -> %p)\n", tail, head); - this->Edges.emplace_back( + this->edges_.emplace_back( Edge(tail, head, std::forward(data)...)); - EdgeRef e = &this->Edges.back(); + EdgeRef e = &this->edges_.back(); head->addInEdge(e); tail->addOutEdge(e); return e; @@ -339,85 +353,85 @@ class Graph { /// related to the node. void deleteNode(NodeRef n, bool deleteEdges = true) { if (deleteEdges) { - auto inEdges = n->inEdges; + auto inEdges = n->inEdges_; for (auto& edge : inEdges) { deleteEdge(edge); } - auto outEdges = n->outEdges; + auto outEdges = n->outEdges_; for (auto& edge : outEdges) { deleteEdge(edge); } } - for (auto i = Nodes.begin(); i != Nodes.end(); ++i) { + for (auto i = nodes_.begin(); i != nodes_.end(); ++i) { if (&*i == n) { - NodeRefs.erase(n); - Nodes.erase(i); + nodeRefs_.erase(n); + nodes_.erase(i); break; } } } - bool hasNode(NodeRef ref) const { - return NodeRefs.find(ref) != NodeRefs.end(); + bool hasNode(NodeRef node) const { + return nodeRefs_.find(node) != nodeRefs_.end(); } /// \brief Deletes a edge from the graph. /// \p e A reference to the edge. - void deleteEdge(EdgeRef e, bool remove_ref = true) { - if (remove_ref) { - e->Tail->removeOutEdge(e); - e->Head->removeInEdge(e); + void deleteEdge(EdgeRef e, bool removeRef = true) { + if (removeRef) { + e->tail_->removeOutEdge(e); + e->head_->removeInEdge(e); } - for (auto i = Edges.begin(); i != Edges.end(); ++i) { + for (auto i = edges_.begin(); i != edges_.end(); ++i) { if (&*i == e) { - Edges.erase(i); + edges_.erase(i); break; } } } const std::vector getMutableNodes() { - std::vector v; - for (auto& n : Nodes) { + std::vector result; + for (auto& n : nodes_) { DEBUG_PRINT("Adding node to mutable output (%p)\n", &n); - v.emplace_back(&n); + result.emplace_back(&n); } - return v; + return result; } const std::vector getMutableEdges() { - std::vector v; - for (auto& e : Edges) { + std::vector result; + for (auto& e : edges_) { DEBUG_PRINT("Adding edge to mutable output (%p)\n", &e); - v.emplace_back(&e); + result.emplace_back(&e); } - return v; + return result; + } + + private: + std::list> nodes_; + std::list> edges_; + std::unordered_set nodeRefs_; + + NodeRef createNodeInternal(Node&& node) { + nodes_.emplace_back(std::move(node)); + NodeRef nodeRef = &nodes_.back(); + DEBUG_PRINT("Creating node (%p)\n", nodeRef); + nodeRefs_.insert(nodeRef); + return nodeRef; } void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge.tail(), edge.head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", &node); } } - - private: - std::list> Nodes; - std::list> Edges; - std::unordered_set NodeRefs; - - NodeRef createNodeInternal(Node&& node) { - Nodes.emplace_back(std::move(node)); - NodeRef nodeRef = &Nodes.back(); - DEBUG_PRINT("Creating node (%p)\n", nodeRef); - NodeRefs.insert(nodeRef); - return nodeRef; - } }; } // namespace nom diff --git a/caffe2/core/nomnigraph/tests/binary_match_test.cc b/caffe2/core/nomnigraph/tests/binary_match_test.cc index 4834cea30f3e23..ca3fd11b3a9126 100644 --- a/caffe2/core/nomnigraph/tests/binary_match_test.cc +++ b/caffe2/core/nomnigraph/tests/binary_match_test.cc @@ -19,7 +19,7 @@ TEST(BinaryMatch, AllMatch) { auto matches = nom::algorithm::binaryMatch( &graph, [](decltype(graph)::NodeRef n) { return true; }); EXPECT_EQ(matches.size(), 1); - EXPECT_EQ(matches.front().Nodes.size(), graph.getMutableNodes().size()); + 
EXPECT_EQ(matches.front().getNodesCount(), graph.getMutableNodes().size()); } TEST(BinaryMatch, EmptyGraph) { @@ -58,9 +58,9 @@ TEST(BinaryMatch, Basic) { EXPECT_EQ(matches.size(), 1); auto match = matches.front(); - EXPECT_EQ(match.Nodes.size(), 4); + EXPECT_EQ(match.getNodesCount(), 4); std::set exp{"2", "3", "4", "6"}; - for (auto n : match.Nodes) { + for (auto n : match.getNodes()) { EXPECT_EQ(exp.count(n->data()), 1); exp.erase(n->data()); } @@ -104,16 +104,16 @@ TEST(BinaryMatch, RemovedMiddleNode) { auto match1 = matches.front(); auto match2 = matches.back(); - EXPECT_EQ(match1.Nodes.size(), 2); - EXPECT_EQ(match2.Nodes.size(), 1); + EXPECT_EQ(match1.getNodesCount(), 2); + EXPECT_EQ(match2.getNodesCount(), 1); std::set exp1{"2", "4"}; std::set exp2{"6"}; - for (auto n : match1.Nodes) { + for (auto n : match1.getNodes()) { EXPECT_EQ(exp1.count(n->data()), 1); exp1.erase(n->data()); } - for (auto n : match2.Nodes) { + for (auto n : match2.getNodes()) { EXPECT_EQ(exp2.count(n->data()), 1); exp2.erase(n->data()); } diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 45f55ab2407a2e..755e1b5a57b8a9 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType() || + if (!noiseBlob->IsType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 9f032e6fe299d0..bcf588d8a384f0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutable(); + auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. 
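The long run of mechanical edits in this test file follows a single pattern: a Blob no longer hands out a device-specific tensor class via GetMutable, it hands out a Tensor for a requested device via GetMutableTensor(CPU); the IsType(CPU) change in mpscnn.mm above is the read-side counterpart. A small sketch of the new call pattern, with shapes and distribution parameters chosen purely for illustration:

```cpp
#include "caffe2/core/context.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/math.h"

// Sketch of the new Blob/Tensor call pattern used throughout these tests;
// the blob name, shape, and random-fill parameters are illustrative only.
void fill_example_blob(caffe2::Workspace& ws) {
  using namespace caffe2;
  // One Tensor type; the device is an argument rather than part of the type.
  auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU);
  t->Resize(1, 3, 8, 8);
  CPUContext ctx;
  math::RandGaussian<float, CPUContext>(
      t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
```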
@@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutable(); + auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(shared ? channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutable(); + auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. 
@@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -683,7 +683,7 @@ void testMPSCNN() { Workspace ws; { auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,8 +1072,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1081,7 +1080,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1093,7 +1092,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1189,7 +1188,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); 
t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1197,7 +1196,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1205,7 +1204,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1276,7 +1275,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1284,7 +1283,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1292,7 +1291,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1386,7 +1385,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1394,7 +1393,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1402,7 +1401,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1494,7 +1493,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1502,7 +1501,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1510,7 +1509,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1608,7 +1607,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1616,7 +1615,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1624,7 +1623,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1727,7 +1726,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1735,7 +1734,7 @@ void testMPSCNN() { } { - auto* t = 
ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1792,7 +1791,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1800,7 +1799,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1857,7 +1856,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1865,7 +1864,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1922,7 +1921,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1930,7 +1929,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2012,7 +2011,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2066,7 +2065,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2137,7 +2136,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2145,7 +2144,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2251,14 +2250,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2363,7 +2362,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2498,7 +2497,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector 
anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2506,7 +2505,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutable(); + auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2514,7 +2513,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutable(); + auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2522,7 +2521,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutable(); + auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2588,7 +2587,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2662,8 +2661,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu") - ->GetMutable(); + auto* t = + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2676,7 +2675,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutable(); + ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize( inputChannels, outputChannels, @@ -2693,7 +2692,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutable(); + ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2810,7 +2809,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2892,7 +2891,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2965,7 +2964,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3338,7 +3337,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { cws.RunNetOnce(initNet); { auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutable(); + cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3350,7 +3349,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { mws.RunNetOnce(initNet); { auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutable(); + mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3398,16 +3397,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutable(); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index 70b9ac05747511..2896bc26ac08d4 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -41,7 +41,7 @@ void RowsWhereRoILevelEquals(Eigen::Ref rois, // distribute those proposals to their appropriate FPN levels for Faster RCNN. // An anchor at one FPN level may predict an RoI that will map to another // level, hence the need to redistribute the proposals. -// Reference: detectron/lib/ops/collect_and_distribute_fpn_rpn_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index 81f7d9ac43123f..faf4936495244f 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -59,7 +59,7 @@ ERMatXf ComputeAllAnchors( // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the // final bounding boxes. 
-// Reference: detectron/lib/ops/generate_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/generate_proposals.py template class GenerateProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 0c4c345d382cb1..333514102b7d4b 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -5,7 +5,7 @@ #include "caffe2/utils/math.h" // Bounding box utils for generate_proposals_op -// Reference: detectron/lib/utils/boxes.py +// Reference: facebookresearch/Detectron/detectron/utils/boxes.py namespace caffe2 { namespace utils { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 5d6f87d4d30563..7b38cd6a1420d6 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -19,7 +19,7 @@ namespace utils { // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. -// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // scores: scores for each bounding box, size: (M, 1) @@ -78,7 +78,7 @@ std::vector nms_cpu_upright( /** * Soft-NMS implementation as outlined in https://arxiv.org/abs/1704.04503. - * Reference: detectron/lib/utils/cython_nms.pyx + * Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx * out_scores: Output updated scores after applying Soft-NMS * proposals: pixel coordinates of proposed bounding boxes, * size: (M, 4), format: [x1; y1; x2; y2] @@ -426,7 +426,7 @@ std::vector nms_cpu( // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. 
-// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // size: (M, 5), format: [ctr_x; ctr_y; w; h; angle (degrees)] for RRPN diff --git a/caffe2/opt/device.cc b/caffe2/opt/device.cc index 9abca6d67e08b3..0cfdd6c1dc91a3 100644 --- a/caffe2/opt/device.cc +++ b/caffe2/opt/device.cc @@ -9,15 +9,14 @@ std::vector getInputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector inputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); NOM_REQUIRE_OR_CONT(nn::hasInputs(node)); // Check if tensor's parents are in the sg for (const auto& input : nn::getInputs(node)) { NOM_REQUIRE_OR_CONT( - !nn::hasProducer(input) || - sg.Nodes.count(nn::getProducer(input)) == 0); + !nn::hasProducer(input) || !sg.hasNode(nn::getProducer(input))); inputTensorEdges.emplace_back(g.getEdge(input, node)); } } @@ -28,13 +27,13 @@ std::vector getOutputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector outputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); for (const auto& output : nn::getOutputs(node)) { auto consumers = nn::getConsumers(output); for (const auto& consumer : consumers) { - NOM_REQUIRE_OR_CONT(sg.Nodes.count(consumer) == 0); + NOM_REQUIRE_OR_CONT(!sg.hasNode(consumer)); outputTensorEdges.emplace_back(g.getEdge(node, output)); } NOM_REQUIRE_OR_CONT(consumers.size() == 0); diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 75baec0e9be66b..09528b99b5da51 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -323,8 +323,10 @@ void OnnxifiTransformer::Transform( // function to tell whether the ONNXIFI backend supports a given C2 op or not // TODO: choose backend id + onnxifi_library* backend = lib_; + onnxBackendID backend_id = backend_ids_[0]; auto supports = - [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( + [&exporter, &shape_hints, backend, backend_id]( const caffe2::OperatorDef& op) { const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); // NB: this might not be a hard constraint as we can just export C2 diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index cb9932bc4542a2..e5c9c095a16c55 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -630,7 +630,7 @@ def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): beta=st.floats(min_value=0.1, max_value=0.9), lambda1=st.floats(min_value=0.001, max_value=0.1), lambda2=st.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None]), + engine=st.sampled_from([None, "SIMD"]), **hu.gcs_cpu_only) def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, engine, gc, dc): diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index db870972f83946..ee60d776d55a82 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1421,7 +1421,8 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - # SIMD version of GFTRL is not supported + if engine == "SIMD": + assert core.IsOperator('GFtrl_ENGINE_SIMD') gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/requirements.txt 
b/caffe2/requirements.txt index 9a1d67efc7c2f3..07fd95b72582a2 100644 --- a/caffe2/requirements.txt +++ b/caffe2/requirements.txt @@ -1,2 +1,4 @@ numpy enum34 +pyyaml +typing diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index e0ae5cc0336e2a..c573542af5763c 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2605,6 +2605,13 @@ bool TransposeWithHPTT( axes_cm[i] = cm_fn(axes[cm_fn(i)]); dims_cm[i] = dims[cm_fn(i)]; } + + // HPTT doesn't handle 0 sized inputs. + for (auto dim : dims_cm) { + if (dim <= 0) { + return false; + } + } auto plan = hptt::create_plan( axes_cm.data(), ndim, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index bc30f35f2a2eee..3829219a933b5d 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -1,3 +1,9 @@ +# This ill-named file does a number of things: +# - Installs Caffe2 header files (this has nothing to do with code generation) +# - Configures caffe2/core/macros.h +# - Creates an ATen target for its generated C++ files and adds it +# as a dependency + if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -14,6 +20,11 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") +if (NOT BUILD_ATEN) + install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core + DESTINATION include/ATen/core + FILES_MATCHING PATTERN "*.h") +endif() install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index 2a4e61f97b0b18..b05acdf1c90558 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -83,22 +83,26 @@ endif() cmake_pop_check_state() # ---[ Check for NUMA support -cmake_push_check_state(RESET) -set(CMAKE_REQUIRED_FLAGS "-std=c++11") -CHECK_CXX_SOURCE_COMPILES( +if (USE_NUMA) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + CHECK_CXX_SOURCE_COMPILES( "#include #include int main(int argc, char** argv) { }" CAFFE2_IS_NUMA_AVAILABLE) - -if (CAFFE2_IS_NUMA_AVAILABLE) - message(STATUS "NUMA is available") + if (CAFFE2_IS_NUMA_AVAILABLE) + message(STATUS "NUMA is available") + else() + message(STATUS "NUMA is not available") + set(CAFFE2_DISABLE_NUMA 1) + endif() + cmake_pop_check_state() else() - message(STATUS "NUMA is not available") + message(STATUS "NUMA is disabled") set(CAFFE2_DISABLE_NUMA 1) endif() -cmake_pop_check_state() # ---[ Check if we want to turn off deprecated warning due to glog. # Note(jiayq): on ubuntu 14.04, the default glog install uses ext/hash_set that diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 987044bbd212f4..9cf2326a9445fb 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -604,6 +604,12 @@ Loss functions .. autoclass:: CrossEntropyLoss :members: +:hidden:`CTCLoss` +~~~~~~~~~~~~~~~~~ + +.. autoclass:: CTCLoss + :members: + :hidden:`NLLLoss` ~~~~~~~~~~~~~~~~~ @@ -1180,6 +1186,11 @@ Loss functions .. autofunction:: cross_entropy +:hidden:`ctc_loss` +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: ctc_loss + :hidden:`hinge_embedding_loss` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index c3c85797b4cd82..06b0305d28aae8 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -46,7 +46,7 @@ A tensor can be constructed from a Python :class:`list` or sequence using the If you have a numpy array and want to avoid a copy, use :func:`torch.as_tensor`. 
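As a quick illustration of the `torch.as_tensor` note above and of the dtype construction described next in this doc, a small hedged sketch (variable names are arbitrary):

```python
import numpy as np
import torch

# torch.as_tensor reuses the NumPy buffer when dtype and device allow it,
# so no copy is made and writes are visible through both views.
a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
t = torch.as_tensor(a)
t[0] = 10.0
assert a[0] == 10.0  # same underlying storage

# Constructing a tensor of a specific data type via a creation op.
z = torch.zeros(2, 3, dtype=torch.float64)
```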
-An tensor of specific data type can be constructed by passing a +A tensor of specific data type can be constructed by passing a :class:`torch.dtype` and/or a :class:`torch.device` to a constructor or tensor creation op: diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 1db0f546724103..62185d1e9dc821 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -296,6 +296,10 @@ fi # Add packages required for all Caffe2 builds add_package 'glog' add_package 'gflags' +add_package 'mkl' '>=2018' +add_package 'mkl-include' +add_package 'typing' +append_to_section 'build' '- pyyaml' caffe2_cmake_args+=("-DUSE_LEVELDB=OFF") caffe2_cmake_args+=("-DUSE_LMDB=OFF") @@ -303,10 +307,6 @@ caffe2_cmake_args+=("-DUSE_LMDB=OFF") # Add packages required for pytorch if [[ -n $integrated ]]; then add_package 'cffi' - add_package 'mkl' '>=2018' - add_package 'mkl-include' - add_package 'typing' - append_to_section 'build' '- pyyaml' append_to_section 'build' '- setuptools' #caffe2_cmake_args+=("-DBLAS=MKL") if [[ -n $cuda_ver ]]; then diff --git a/setup.py b/setup.py index 042d8668bb7b96..67330417492c3f 100644 --- a/setup.py +++ b/setup.py @@ -1023,6 +1023,7 @@ def make_relative_rpath(path): 'lib/torch_shm_manager', 'lib/*.h', 'lib/include/ATen/*.h', + 'lib/include/ATen/core/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', diff --git a/test/common_nn.py b/test/common_nn.py index 6172f4b15adc3f..33b2e94204f3ca 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -448,6 +448,43 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele return output +# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space +def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): + input_lengths = torch.tensor(input_lengths, dtype=torch.long) + target_lengths = torch.tensor(target_lengths, dtype=torch.long) + dt = log_probs.dtype + log_probs = log_probs.double() # we need the accuracy as we are not in logspace + targets = targets.long() + cum_target_lengths = target_lengths.cumsum(0) + losses = [] + for i in range(log_probs.size(1)): + input_length = input_lengths[i].item() + target_length = target_lengths[i].item() + cum_target_length = cum_target_lengths[i].item() + targets_prime = targets.new_full((2 * target_length + 1,), blank) + if targets.dim() == 2: + targets_prime[1::2] = targets[i, :target_length] + else: + targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] + probs = log_probs[:input_length, i].exp() + alpha = log_probs.new_zeros((target_length * 2 + 1,)) + alpha[0] = probs[0, blank] + alpha[1] = probs[0, targets_prime[1]] + mask_third = (targets_prime[:-2] != targets_prime[2:]) + for t in range(1, input_length): + alpha_next = alpha.clone() + alpha_next[1:] += alpha[:-1] + alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) + alpha = probs[t, targets_prime] * alpha_next + losses.append(-alpha[-2:].sum().log()[None]) + output = torch.cat(losses, 0) + if reduction == 'elementwise_mean': + return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() + elif reduction == 'sum': + return output.sum() + output = output.to(dt) + return output + loss_reference_fns = { 'KLDivLoss': kldivloss_reference, 'NLLLoss': nllloss_reference, @@ -460,6 +497,7 @@ def marginrankingloss_reference(input1, input2, target, margin=0, 
reduction='ele 'CosineEmbeddingLoss': cosineembeddingloss_reference, 'TripletMarginLoss': tripletmarginloss_reference, 'MarginRankingLoss': marginrankingloss_reference, + 'CTCLoss': ctcloss_reference, } @@ -841,7 +879,7 @@ def check_criterion_jacobian(self, criterion, input, target): class TestBase(object): - _required_arg_names = {'constructor_args', 'input'} + _required_arg_names = {'constructor_args', 'input', 'extra_args'} def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): self.desc = desc @@ -850,8 +888,8 @@ def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwa self.reference_fn = reference_fn for name in self._required_arg_names: if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name == 'constructor_args': - kwargs['constructor_args'] = tuple() + if name in {'constructor_args', 'extra_args'}: + kwargs[name] = tuple() else: raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" .format(self.get_name(), name)) @@ -879,6 +917,10 @@ def _unpack(self, value): def constructor_args(self): return self._get_arg('constructor_args', True) + @property + def extra_args(self): + return self._get_arg('extra_args', True) + def _get_arg(self, name, unpack): assert name in self._required_arg_names @@ -1103,9 +1145,9 @@ def __call__(self, test_case): target = self._get_target() if self.reference_fn is not None: - out = test_case._forward_criterion(module, input, target) - expected_out = self.reference_fn(deepcopy(input), - deepcopy(target), module) + out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args) + ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,) + expected_out = self.reference_fn(*ref_args) if isinstance(expected_out, torch.Tensor): expected_out = expected_out.item() test_case.assertEqual(out, expected_out) diff --git a/test/expect/TestJit.test_concat_fusion.expect b/test/expect/TestJit.test_concat_fusion.expect index 027c2de33e5926..454a84cba1db76 100644 --- a/test/expect/TestJit.test_concat_fusion.expect +++ b/test/expect/TestJit.test_concat_fusion.expect @@ -3,12 +3,11 @@ graph(%0 : Float(3, 20) %2 : Float(6, 20) = prim::FusionGroup_0[device=0](%0, %1) return (%2); } -with prim::FusionGroup_0 = graph(%4 : Float(3, 20) - %5 : Float(3, 20)) { - %7 : int = prim::Constant[value=1]() - %8 : Float(3, 20) = aten::add(%4, %5, %7) - %6 : Float(3, 20) = aten::mul(%4, %5) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%8, %6, %2) - return (%3); +with prim::FusionGroup_0 = graph(%3 : Float(3, 20) + %4 : Float(3, 20)) { + %6 : int = prim::Constant[value=1]() + %7 : Float(3, 20) = aten::add(%3, %4, %6) + %5 : Float(3, 20) = aten::mul(%3, %4) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%7, %5) + return (%2); } diff --git a/test/expect/TestJit.test_constant_prop_nested.expect b/test/expect/TestJit.test_constant_prop_nested.expect new file mode 100644 index 00000000000000..5af0203c90349f --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_nested.expect @@ -0,0 +1,16 @@ +graph(%a : Dynamic) { + %1 : int = prim::Constant[value=2]() + %2 : Dynamic = aten::lt(%a, %1) + %3 : int = prim::TensorToNum(%2) + %c : int = prim::If(%3) + block0() { + %5 : int = prim::Constant[value=5]() + -> (%5) + } + block1() { + %6 : int = prim::Constant[value=1]() + -> (%6) + } + %7 : Long() = prim::NumToTensor(%c) + return (%7); +} diff --git a/test/expect/TestJit.test_constant_prop_print.expect 
b/test/expect/TestJit.test_constant_prop_print.expect new file mode 100644 index 00000000000000..7cadfdbbc6b3ea --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -0,0 +1,12 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=6]() + %2 : Dynamic = ^FIXME_zerol()() + %a : Dynamic = aten::add(%1, %2) + = prim::Print(%a) + %4 : int = prim::Constant[value=2]() + %5 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %4, %5) + %7 : int = prim::Constant[value=1]() + %8 : Dynamic = aten::add(%b, %input_tensor, %7) + return (%8); +} diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect new file mode 100644 index 00000000000000..a6c305258bff95 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_rand.expect @@ -0,0 +1,11 @@ +graph() { + %0 : int = prim::Constant[value=6]() + %1 : int = prim::Constant[value=0]() + %2 : int[] = prim::Constant[value=[0, -1]]() + %3 : int[] = prim::Constant[value=[3]]() + %a : Dynamic = aten::randn(%3, %0, %1, %2) + %5 : int = prim::Constant[value=2]() + %6 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %5, %6) + return (%b); +} diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect new file mode 100644 index 00000000000000..029f9ac05a0783 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -0,0 +1,5 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=8]() + %2 : Dynamic = aten::add(%1, %input_tensor) + return (%2); +} diff --git a/test/expect/TestJit.test_lstm_fusion_concat.expect b/test/expect/TestJit.test_lstm_fusion_concat.expect index 7884a95c48c9a1..f0771c133c11d9 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat.expect @@ -16,34 +16,33 @@ graph(%0 : Float(3, 10) %21 : Float(6, 20) = prim::FusionGroup_0[device=0](%2, %16, %20, %15, %19, %14, %18, %13, %17) return (%21); } -with prim::FusionGroup_0 = graph(%16 : Float(3, 20) +with prim::FusionGroup_0 = graph(%15 : Float(3, 20) + %25 : Float(3!, 20) %26 : Float(3!, 20) - %27 : Float(3!, 20) + %29 : Float(3!, 20) %30 : Float(3!, 20) - %31 : Float(3!, 20) + %33 : Float(3!, 20) %34 : Float(3!, 20) - %35 : Float(3!, 20) - %38 : Float(3!, 20) - %39 : Float(3!, 20)) { - %40 : int = prim::Constant[value=1]() - %41 : Float(3, 20) = aten::add(%38, %39, %40) - %36 : int = prim::Constant[value=1]() - %37 : Float(3, 20) = aten::add(%34, %35, %36) - %32 : int = prim::Constant[value=1]() - %33 : Float(3, 20) = aten::add(%30, %31, %32) - %28 : int = prim::Constant[value=1]() - %29 : Float(3, 20) = aten::add(%26, %27, %28) - %25 : Float(3, 20) = aten::sigmoid(%41) - %23 : Float(3, 20) = aten::sigmoid(%37) - %21 : Float(3, 20) = aten::tanh(%33) - %19 : Float(3, 20) = aten::sigmoid(%29) - %17 : Float(3, 20) = aten::mul(%23, %16) - %14 : Float(3, 20) = aten::mul(%25, %21) - %10 : int = prim::Constant[value=1]() - %11 : Float(3, 20) = aten::add(%17, %14, %10) - %7 : Float(3, 20) = aten::tanh(%11) - %6 : Float(3, 20) = aten::mul(%19, %7) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%6, %11, %2) - return (%3); + %37 : Float(3!, 20) + %38 : Float(3!, 20)) { + %39 : int = prim::Constant[value=1]() + %40 : Float(3, 20) = aten::add(%37, %38, %39) + %35 : int = prim::Constant[value=1]() + %36 : Float(3, 20) = aten::add(%33, %34, %35) + %31 : int = prim::Constant[value=1]() + %32 : Float(3, 20) = 
aten::add(%29, %30, %31) + %27 : int = prim::Constant[value=1]() + %28 : Float(3, 20) = aten::add(%25, %26, %27) + %24 : Float(3, 20) = aten::sigmoid(%40) + %22 : Float(3, 20) = aten::sigmoid(%36) + %20 : Float(3, 20) = aten::tanh(%32) + %18 : Float(3, 20) = aten::sigmoid(%28) + %16 : Float(3, 20) = aten::mul(%22, %15) + %13 : Float(3, 20) = aten::mul(%24, %20) + %9 : int = prim::Constant[value=1]() + %10 : Float(3, 20) = aten::add(%16, %13, %9) + %6 : Float(3, 20) = aten::tanh(%10) + %5 : Float(3, 20) = aten::mul(%18, %6) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%5, %10) + return (%2); } diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect index ea2fa3737c0556..c8c82e5199c030 100644 --- a/test/expect/TestScript.test_cat_lifts.expect +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -1,15 +1,18 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x, %x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct() + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 591e499da96671..37f08643f139a4 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -6,6 +6,7 @@ graph(%0 : Double(100) %5 : Double(4) = aten::view(%2, %4) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) - %19 : Double(100) = aten::index_put(%0, %7, %5) - return (%19); + %8 : Dynamic[] = prim::ListConstruct(%7) + %20 : Double(100) = aten::index_put(%0, %8, %5) + return (%20); } diff --git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 42f8e49142942e..772308223b454b 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -3,6 +3,7 @@ graph(%0 : Double(100) %2 : Double(4)) { %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) - %16 : Double(100) = aten::index_put(%0, %4, %2) - return (%16); + %5 : Dynamic[] = prim::ListConstruct(%4) + %17 : Double(100) = aten::index_put(%0, %5, %2) + return (%17); } diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index b1ff53c2e4e7d8..3c1321664dd3fd 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -10,33 +10,33 @@ graph { t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "0" - input: "1" output: "2" - op_type: "Reshape" - } - node { - output: "3" op_type: 
"Constant" attribute { name: "value" t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" } type: TENSOR } } node { + input: "0" input: "2" + output: "3" + op_type: "Reshape" + } + node { input: "3" + input: "1" output: "4" op_type: "Tile" } diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 85ef2eac5bf2ce..0284828e400487 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -676,6 +676,40 @@ def forward(self, x): x = Variable(torch.randn(*shape)) self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + def test_repeat(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(4, 3, 2, 1), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dim_overflow(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(1, 2), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dynamic(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x, y): + return x.repeat(y.size()[0] / 2, y.size()[1] * 2) + + x = Variable(torch.randn(1, 2), requires_grad=True) + y = Variable(torch.randn(2, 4), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) + def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{'dim': i} for i in range(len(shape))]: diff --git a/test/test_distributions.py b/test/test_distributions.py index 7effb9012e9fc6..263c9041da991d 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -3475,7 +3475,7 @@ def setUp(self): ), ( Binomial(10, simplex_tensor), - scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor) + scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor.numpy()) ), ( Cauchy(random_var, positive_var), diff --git a/test/test_jit.py b/test/test_jit.py index ab4c907e72d19f..75fe7fca47b9f7 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1122,6 +1122,87 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) + def test_constant_prop_simple(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + b = a + 2 + return b + input_tensor + + x = torch.tensor(2) + out_ref = constant_prop(x) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_nested(self): + @torch.jit.script + def constant_prop(a): + b = 2 + 1 + if a < 2: + c = b + 2 + else: + c = b - 2 + return c + + out_ref = constant_prop(torch.tensor(2)) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + 
self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_print(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + FIXME_zerol() + print(a) + b = a + 2 + return b + input_tensor + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_rand(self): + @torch.jit.script + def constant_prop(): + a = torch.randn([3]) + b = a + 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_if_constant(self): + @torch.jit.script + def constant_prop(): + b = 3 + if True: + b = 1 + if False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_loop_constant(self): + @torch.jit.script + def constant_prop(): + b = 0 + while True: + b = 1 + while False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index 1463d15cf22d0c..de65e6fc8ce7a0 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -693,14 +693,18 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return module.backward(input, grad_output) - def _forward_criterion(self, criterion, input, target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() with torch.no_grad(): - return criterion.forward(input, target) + return criterion.forward(input, target, *extra_args) - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() # Ignore gradOutput. It's used for non-legacy tests. 
with torch.no_grad(): - return criterion.backward(input, target) + return criterion.backward(input, target, *extra_args) def _zero_grad_parameters(self, module): return module.zeroGradParameters() diff --git a/test/test_nn.py b/test/test_nn.py index ccd698747ae8d5..4498434f1cf2c4 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -36,7 +36,7 @@ TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, loss_reference_fns, get_reduction, \ - get_weight, smoothl1loss_reference, kldivloss_reference + get_weight, smoothl1loss_reference, kldivloss_reference, ctcloss_reference if TEST_SCIPY: @@ -383,6 +383,8 @@ class NewCriterionTest(InputVariableMixin, CriterionTest): def __init__(self, *args, **kwargs): super(NewCriterionTest, self).__init__(*args, **kwargs) self.check_gradgrad = kwargs.get('check_gradgrad', True) + self.check_half = kwargs.get('check_half', True) + self.convert_target = kwargs.get('convert_target', True) def _do_extra_tests(self, test_case, module, input, target): if not self.check_gradgrad: @@ -407,7 +409,7 @@ def apply_fn(input1, input2, *params): gradcheck(apply_fn, inputs) gradgradcheck(apply_fn, inputs) - def test_cuda(self, test_case, dtype=None): + def test_cuda(self, test_case, dtype=None, extra_args=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) @@ -430,7 +432,7 @@ def convert_dtype(obj, dtype, requires_grad=False): if dtype is not None: cpu_input = convert_dtype(cpu_input, dtype, True) # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor): + if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: cpu_target = convert_dtype(cpu_target, dtype) cpu_module.type(dtype) gpu_module.type(dtype) @@ -447,13 +449,13 @@ def convert_dtype(obj, dtype, requires_grad=False): # Loss modules with weights require consistent input/module weight types cpu_module = self.constructor(*self.constructor_args) - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) - gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target) + cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) # dtype can be None, so set precision in this way instead of a precision map test_case.assertEqual(cpu_output, gpu_output, 1e-1 if dtype == torch.half else 4e-4) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target) + cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) test_case.assertEqual(cpu_gradInput, gpu_gradInput, 1e-1 if dtype == torch.half else 4e-4) except NotImplementedError: pass @@ -465,6 +467,10 @@ def _get_target(self): def constructor_args(self): return self._get_arg('constructor_args', False) + @property + def extra_args(self): + return self._get_arg('extra_args', False) + class TestNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -479,20 +485,24 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return None return input.grad.data - def _forward_criterion(self, criterion, input, 
target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() if isinstance(input, tuple): - args = input + (target,) + args = input + (target,) + extra_args output = criterion(*args) else: - output = criterion(input, target) + output = criterion(input, target, *extra_args) return output.item() - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() input_tuple = input if isinstance(input, tuple) else (input,) for i in input_tuple: if i.grad is not None: i.grad.data.zero_() - args = input_tuple + (target,) + args = input_tuple + (target,) + extra_args if gradOutput is None: gradOutput = torch.ones(()) criterion(*args).backward(gradOutput.type_as(input_tuple[0])) @@ -3578,6 +3588,19 @@ def test_NLLLoss_mismatched_batch(self): with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): F.nll_loss(x, t) + @unittest.skipIf(not (TEST_CUDNN and TEST_CUDNN_VERSION >= 7000), "needs cudnn >= 7.0") + def test_CTCLoss_cudnn(self): + target_lengths = [30, 25, 20] + input_lengths = [50, 50, 50] + targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) + log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2) + res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) + expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float() + with torch.backends.cudnn.flags(enabled=False): + res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, target_lengths) + self.assertEqual(res, expected) + self.assertEqual(res2, res) + def test_RNN_cell_no_broadcasting(self): def test(cell_module, input, hx, input_size, hidden_size): cell = cell_module(input_size, hidden_size) @@ -6006,15 +6029,20 @@ def add(test_name, fn): add(test_name, lambda self, test=test: test(self)) cuda_test_name = test_name + '_cuda' # With dtype enable, it's good enough to test against three floating types + kwargs = {} + if 'extra_args' in get_function_arglist(test.test_cuda): + kwargs['extra_args'] = test.extra_args + if 'dtype' in get_function_arglist(test.test_cuda): add(cuda_test_name + '_float', lambda self, - test=test: test.test_cuda(self, dtype=torch.float)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) add(cuda_test_name + '_double', lambda self, - test=test: test.test_cuda(self, dtype=torch.double)) - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) + if getattr(test, 'check_half', True): + add(cuda_test_name + '_half', lambda self, + test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) else: - add(cuda_test_name, lambda self, test=test: test.test_cuda(self)) + add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) def wrap_functional(fn, **kwargs): @@ -6174,6 +6202,45 @@ def forward(self, *args): check_sum_reduction=True, check_gradgrad=False, ), + dict( + module_name='CTCLoss', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + 
ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='1d_target', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='2d_int_target', + constructor_args=(0,), # blank=0 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + convert_target=False, + ), ] diff --git a/test/test_torch.py b/test/test_torch.py index 2a8c897713111f..745e4cf4b52616 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1690,6 +1690,7 @@ def test_einsum(self): ("...ii->...i", I), # batch diagonal # -- Other ("bn,anm,bm->ba", l, w, r), # as torch.bilinear + ("... ii->...i ", I), # batch diagonal with spaces ] for test in test_list: actual = torch.einsum(test[0], test[1:]) @@ -7947,6 +7948,20 @@ def test_ctor_with_numpy_array(self): for i in range(len(array)): self.assertEqual(tensor[i], array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_ctor_with_numpy_scalar_ctor(self): + dtypes = [ + np.double, + np.float, + np.float16, + np.int64, + np.int32, + np.int16, + np.uint8 + ] + for dtype in dtypes: + self.assertEqual(dtype(42), torch.tensor(dtype(42)).item()) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_index(self): i = np.int32([0, 1, 2]) @@ -8034,6 +8049,17 @@ def test_numpy_array_interface(self): for i in range(len(x)): self.assertEqual(geq2_x[i], geq2_array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_multiplication_numpy_scalar(self): + np_sc = np.float64(2.0) + t = torch.ones(2, requires_grad=True) + r1 = np_sc * t + self.assertIsInstance(r1, torch.Tensor) + self.assertTrue(r1.requires_grad) + r2 = t * np_sc + self.assertIsInstance(r2, torch.Tensor) + self.assertTrue(r2.requires_grad) + def test_error_msg_type_translation(self): with self.assertRaisesRegex( RuntimeError, diff --git a/third_party/onnx b/third_party/onnx index c761845c7f6880..df01dbc0051906 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit c761845c7f6880ab7eb7e2866d673834c7149e89 +Subproject commit df01dbc00519067ba6d4e818421eccd1d2552e35 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 14fd6d7cf5e09c..e3fc510a9a0d21 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -201,6 +201,9 @@ - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) self, weight, bias: conv_tbc_backward(grad, self, weight, bias, pad) +- name: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank) + log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank) + - name: det(Tensor self) self: det_backward(grad, self, 
result) @@ -1145,6 +1148,8 @@ output: -2 * output * grad * grad_output # cudnn +- name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) + log_probs: result1 - name: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) self, weight, bias: cudnn_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index abbadc70691b46..77b101dedf0f3e 100644 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -7,6 +7,7 @@ import subprocess import sys + DEFAULT_FILE_PATTERN = r".*\.[ch](pp)?" # @@ -start,count +start,count @@ @@ -26,6 +27,11 @@ def run_shell_command(arguments, process_name=None): return output.decode() +def normalize_directory_path(path): + """Normalizes a directory path.""" + return path.rstrip('/') + + def transform_globs_into_regexes(globs): """Turns glob patterns into regular expressions.""" return [glob.replace("*", ".*").replace("?", ".") for glob in globs] @@ -49,16 +55,37 @@ def git_diff(args, verbose): return run_shell_command(command, process_name="git diff") -def filter_files(files, file_patterns): +def filter_files(files, file_patterns, verbose): """Returns all files that match any of the patterns.""" filtered = [] for file in files: + has_match = False for pattern in file_patterns: - if pattern.match(file): + if pattern.search(file): filtered.append(file) + has_match = True + if not has_match and verbose: + message = "{} does not match any ".format(file) + message += "file pattern in {{{}}}".format(', '.join(map(str, file_patterns))) + print(message) return filtered +def remove_recursive_files(files, paths, verbose): + """ + Removes all files that are not immediately under one of the given paths. + """ + for file in files: + if os.path.dirname(file) in paths: + yield file + else: + if verbose: + + message = "{} ({}) does not match any ".format(file, os.path.dirname(file)) + message += "non-recursive path in {{{}}}".format(", ".join(paths)) + print(message) + + def get_changed_files(revision, paths, verbose): """Runs git diff to get the paths of all changed files.""" # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). 
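The tools/clang_tidy.py hunks nearby switch file filtering from `pattern.match` to `pattern.search` and add a non-recursive path mode. A hedged Python sketch of the behavioural difference (the paths and the `keep_non_recursive` helper are illustrative, not the script's actual API):

```python
import os
import re

# re.match anchors at the start of the string; re.search does not.
# A user-supplied pattern such as r"passes/.*\.cpp" therefore only
# selects files once search() is applied to full repository paths.
path = "torch/csrc/jit/passes/constant_propagation.cpp"
pattern = re.compile(r"passes/.*\.cpp")
assert pattern.match(path) is None
assert pattern.search(path) is not None

# Sketch of the -n/--no-recursive filter: keep only files that sit
# directly under one of the requested (normalized, no trailing slash) paths.
def keep_non_recursive(files, paths):
    return [f for f in files if os.path.dirname(f) in paths]

print(keep_non_recursive(
    ["torch/csrc/jit/ir.cpp", "torch/csrc/jit/passes/canonicalize.cpp"],
    ["torch/csrc/jit"]))
# -> ['torch/csrc/jit/ir.cpp']
```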
@@ -152,7 +179,17 @@ def parse_options(): ) parser.add_argument("-r", "--revision", help="Git revision to get changes from") parser.add_argument( - "-p", "--paths", nargs="+", default=["."], help="Lint only the given paths" + "-p", + "--paths", + nargs="+", + default=["."], + help="Lint only the given paths (recursively)", + ) + parser.add_argument( + "-n", + "--no-recursive", + action="store_true", + help="If paths are supplied with -p/--paths, do not recurse into paths", ) parser.add_argument( "-s", @@ -173,12 +210,15 @@ def parse_options(): def main(): options = parse_options() + paths = map(normalize_directory_path, options.paths) if options.revision: - files = get_changed_files(options.revision, options.paths, options.verbose) + files = get_changed_files(options.revision, paths, options.verbose) else: - files = get_all_files(options.paths) + files = get_all_files(paths) + if options.no_recursive: + files = remove_recursive_files(files, paths, options.verbose) file_patterns = get_file_patterns(options.glob, options.regex) - files = filter_files(files, file_patterns) + files = filter_files(files, file_patterns, options.verbose) # clang-tidy error's when it does not get input files. if not files: diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index b35435acb388c6..6a50c14e05523e 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ -DCMAKE_INSTALL_MESSAGE=NEVER \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -G "$GENERATE" \ $PYTORCHPATH/ $MAKE -j "$JOBS" install diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index 92a9b9981ed697..6dd9a589cf1074 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_INSTALL_MESSAGE=NEVER \ -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -DVERBOSE:BOOL=${VERBOSE:-0} \ -G "$GENERATE" \ $PYTORCHPATH/torch diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index ad9ad2e05c4f4c..be99490ab41a1c 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -84,6 +84,7 @@ def from_attribute(arg): 'Scalar': '{}.toScalar()', 'ScalarType': 'static_cast({}.toInt())', 'Tensor': '{}.toTensor()', + 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 'int64_t': '{}.toInt()', @@ -106,7 +107,7 @@ def from_ivalue(arg, value): ); """) CALL_METHOD = CodeTemplate("""\ -DeviceGuard device_guard(deviceForInputs(stack, ${num_dynamic_inputs})); +DeviceGuard device_guard(deviceForInputs(stack, ${num_inputs})); auto result = (${first}).${name}( ${args} ); @@ -129,7 +130,7 @@ def from_ivalue(arg, value): return Operation([=](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); ${call} - drop(stack, ${num_dynamic_inputs}); + drop(stack, ${num_inputs}); pack(stack, std::move(result)); return 0; }); @@ -171,9 +172,6 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] - # Only support a single TensorList arg - if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: - return False return ((not decl['api_name'].endswith('_') or 
is_magic_method(decl['api_name'])) and not decl['name'].endswith('_out') and @@ -197,7 +195,7 @@ def gen_jit_dispatch(declarations, out, template_path): ops = [] - def get_invocation(decl, args, num_dynamic_inputs): + def get_invocation(decl, args, num_inputs): # because the arg list can get lengthy we put them on a separate line def pack_arguments(args): @@ -211,67 +209,26 @@ def pack_arguments(args): elif 'namespace' in decl['method_of']: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) else: return CALL_METHOD.substitute( name=decl['name'], first=args[0], args=pack_arguments(args[1:]), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) - def emit_decl_variant(decl, is_positional_arg, has_tensorlist): + def emit_decl_variant(decl, is_positional_arg): # is_positional_arg is a boolean list the same length as decl['arguments'] # that indicates if the argument should come from the postional list # of inputs. If false, the argument comes from the constant attributes kw_assignments = [] arguments = [] - - if has_tensorlist: - kw_assignments.append('size_t varargs_length = node->inputs().size();') - # arguments look like: [tensor list], arg1, arg2, arg3 - # we use peek(, static_inputs) to read the non-vararg inputs - # from the end of the stack - static_inputs = sum(is_positional_arg) - 1 - num_dynamic_inputs = 'varargs_length' - tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] - else: - static_inputs = sum(is_positional_arg) - num_dynamic_inputs = static_inputs + num_inputs = sum(is_positional_arg) real_inputs = 0 for i, arg in enumerate(decl['arguments']): - # This conditional allows us to process argument lists with a flattened argument list - # with a single TensorList. Given the sequence of arguments: - # a b c [d e f g] h i # [] is the list - # - # 1. For the section where we are processing positional inputs before the - # TensorList: - # a b c [d e f g] h i # [] is the list - # ~~~~~~~~~~~~ <- N - # we set this view_length to the total number of varargs inputs (i.e. the length) - # of the whole argument list. This means that indexing into the list using peek() - # we will retrieve arguments ar their true indices (i.e. peek at 0 points to a, - # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the - # list itself this way. - # 2. After the list: - # a b c [d e f g] h i # [] is the list - # ~~~~~~ <- N - # Here we set the view length to static_inputs. In our example, - # we effectively ignore the fact that we have a list here. What is - # significant is that our index i is equivalent when the view length - # is right-justified, whether we have the list or not. Concretely, - # indexing h or i from `a b c [d e f g] h i` is equvalent to indexing - # h or i from `a b c h i`. - view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs - - if arg['simple_type'] == 'TensorList': - # NOTE: don't advance real_inputs here. After this we are going - # to switch over to indexing from the end as if we only had - # the static arguments. 
- arguments.append('toTensors(peekSlice(stack, {}, varargs_length - {}, varargs_length))' - .format(real_inputs, static_inputs)) - elif arg['simple_type'] in default_only_types: + if arg['simple_type'] in default_only_types: arguments.append(arg['default']) elif is_tensor_arg(arg) or is_positional_arg[i]: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, view_length) + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) arguments.append(from_ivalue(arg, value)) real_inputs += 1 else: @@ -279,20 +236,18 @@ def emit_decl_variant(decl, is_positional_arg, has_tensorlist): kw_assignments.append(assign) arguments.append(arg['name']) - call = get_invocation(decl, arguments, num_dynamic_inputs) + call = get_invocation(decl, arguments, num_inputs) returns = decl['returns'] - all_scalars = all(r['dynamic_type'] != 'TensorList' for r in returns) constructor = CONSTRUCTOR.substitute(name=decl['name'], call=call, kw_assignments=kw_assignments, - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) return constructor def emit_decl(decl): arguments = decl['arguments'] - has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) num_tensor_args = sum(map(is_tensor_arg, arguments)) # Right now, we generate dispatch methods that either take all non-tensor arguments @@ -304,12 +259,12 @@ def emit_decl(decl): all_real_arguments_are_inputs = tuple(arg['simple_type'] not in default_only_types for arg in arguments) only_tensors_are_inputs = tuple(is_tensor_arg(arg) for arg in arguments) - variants = [emit_decl_variant(decl, all_real_arguments_are_inputs, has_tensorlist)] + variants = [emit_decl_variant(decl, all_real_arguments_are_inputs)] # in some cases there are no inputs that are possibly attributes, so the # variants are actually the same. If so avoid generating both to save compilation # time. if all_real_arguments_are_inputs != only_tensors_are_inputs: - variants += [',', emit_decl_variant(decl, only_tensors_are_inputs, has_tensorlist)] + variants += [',', emit_decl_variant(decl, only_tensors_are_inputs)] ops.append(OPERATOR.substitute(signature=signature(decl), ops=variants)) diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 06ad9c2840b1cc..2ff38e80b56e3f 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -29,7 +29,6 @@ using autograd::Variable; using autograd::variable_list; using at::Scalar; using at::Tensor; -using at::TensorList; using at::TensorOptions; using at::DeviceGuard; @@ -42,12 +41,6 @@ int deviceForInputs(Stack & stack, size_t N) { return t.type().is_cuda() ? 
(int) t.get_device() : -1; } -std::vector toTensors(at::ArrayRef ivalues) { - return fmap(ivalues, [](const IValue& v) { - return v.toTensor(); - }); -} - template std::array as_bool_array(const std::vector& vec) { std::array res; diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 88546fda7ed604..3e43a6e00b7359 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -138,6 +138,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index c0f56eea72fbd0..2ae5c5d93752c1 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -48,7 +48,7 @@ class CursorBase { /// A `(key, value)` pair exposed by cursor iterators. struct Item { - Item(const std::string& key_, T& module_); + Item(const std::string& key_, T& value_); T& operator*(); const T& operator*() const; diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 7327d03f11b887..1f12f0a65c7460 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -18,7 +18,7 @@ struct AnomalyMode { struct AnomalyMetadata { - virtual ~AnomalyMetadata(){}; + virtual ~AnomalyMetadata() = default; virtual void store_stack() = 0; virtual void print_stack() = 0; }; diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 8309ba1ce1038c..74e15f5caefe9d 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -159,7 +159,7 @@ struct GraphTask { std::unordered_map exec_info; std::vector captured_vars; - void init_to_execute(Function& graph_root, const edge_list& captures); + void init_to_execute(Function& graph_root, const edge_list& outputs); // The value of worker_device in the thread that created this task. 
// See Note [Reentrant backwards] @@ -499,14 +499,14 @@ struct ClearCallbacks { std::mutex& callbacks_lock; }; -auto Engine::execute(const edge_list& input_roots, +auto Engine::execute(const edge_list& roots, const variable_list& inputs, bool keep_graph, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); - validate_outputs(input_roots, const_cast(inputs), [](const std::string& msg) { + validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { return msg; }); @@ -517,7 +517,7 @@ auto Engine::execute(const edge_list& input_roots, std::unique_lock lock(graph_task.mutex); // Now compute the dependencies for all executable functions and queue the root - auto graph_root = std::make_shared(input_roots, inputs); + auto graph_root = std::make_shared(roots, inputs); compute_dependencies(graph_root.get(), graph_task); if (!outputs.empty()) { graph_task.init_to_execute(*graph_root, outputs); diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index db8b3357ac2536..94490303ccc240 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -57,7 +57,7 @@ struct TORCH_API Engine { ReadyQueue& ready_queue(int device); void start_threads(); virtual void thread_init(int device); - virtual void thread_main(GraphTask *task); + virtual void thread_main(GraphTask *graph_task); virtual void thread_on_exception(FunctionTask& task, std::exception& e); std::once_flag start_threads_flag; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b02bdf3928f2ff..46a80b90b29ffa 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -328,7 +328,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// See Function::is_traceable() for definition. 
struct TraceableFunction : public Function { using Function::Function; - bool is_traceable() final override { + bool is_traceable() final { return true; } }; diff --git a/torch/csrc/autograd/function_hook.h b/torch/csrc/autograd/function_hook.h index 03c52fea54535c..f3cf5b2e793c6a 100644 --- a/torch/csrc/autograd/function_hook.h +++ b/torch/csrc/autograd/function_hook.h @@ -10,12 +10,12 @@ struct Variable; using variable_list = std::vector; struct FunctionPreHook { - virtual ~FunctionPreHook() {} + virtual ~FunctionPreHook() = default; virtual variable_list operator()(const variable_list& grads) = 0; }; struct FunctionPostHook { - virtual ~FunctionPostHook() {} + virtual ~FunctionPostHook() = default; virtual variable_list operator()(const variable_list& grad_input, const variable_list& grad_output) = 0; }; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 44d4b7f106c860..db86ae428d4060 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -6,9 +6,9 @@ namespace torch { namespace autograd { struct AccumulateGrad : public Function { - explicit AccumulateGrad(Variable variable); + explicit AccumulateGrad(Variable variable_); - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; Variable variable; }; diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index b04b0f25ca42d5..c4a54d99d08702 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -11,7 +11,7 @@ namespace torch { namespace autograd { -auto Error::apply(variable_list&& grad_outputs) -> variable_list { +auto Error::apply(variable_list&& inputs) -> variable_list { throw std::runtime_error(msg); } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index aa4b422136930f..1a21a360ba9fc2 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -13,7 +13,7 @@ namespace torch { namespace autograd { struct CopyBackwards : public Function { - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; at::Type *src_type; int32_t src_device = -1; @@ -23,9 +23,12 @@ struct CopyBackwards : public Function { // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. struct CopySlices : public Function { - CopySlices(const Variable& base, at::TensorGeometry view, std::shared_ptr fn); + CopySlices( + const Variable& base_var, + at::TensorGeometry view_, + std::shared_ptr fn_); - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; void release_variables() override; at::TensorGeometry base; diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index 2e0febfc84b0bc..f1c02e0d78e565 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -22,14 +22,14 @@ struct InputBuffer { InputBuffer& operator=(InputBuffer&& other) = default; // Accumulates the variable at a specified index. - void add(size_t idx, Variable var); + void add(size_t pos, Variable var); int device() const; Variable operator[](size_t pos) { return buffer[pos]; } // Returns the inputs as a list of variables. Destroys given InputBuffer. 
- static std::vector variables(InputBuffer&& buffer); + static std::vector variables(InputBuffer&& g); private: std::vector buffer; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index dd77dc193ba9bd..ba0fee1510baa2 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -185,7 +185,7 @@ struct TORCH_API RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that // there no autograd functions are being executed when these function are used. -TORCH_API void enableProfiler(ProfilerState state); +TORCH_API void enableProfiler(ProfilerState new_state); TORCH_API thread_event_lists disableProfiler(); } // namespace profiler diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 61a1d3b3eac172..037f06a7f95c11 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -45,10 +45,10 @@ class TORCH_API SavedVariable { std::weak_ptr grad_accumulator_; VariableVersion version_counter_; - uint32_t saved_version_; - uint32_t output_nr_; + uint32_t saved_version_ = 0; + uint32_t output_nr_ = 0; bool was_default_constructed_ = true; - bool requires_grad_; - bool has_grad_fn_; + bool requires_grad_ = false; + bool has_grad_fn_ = false; }; }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index c97a0322359a4d..633b8028f765df 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -263,7 +263,7 @@ struct Variable::Impl : public at::TensorImpl { TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, - Edge edge = Edge()); + Edge gradient_edge = Edge()); ~Impl() override; diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index d6bd90cb708784..f404b4ce9a05c6 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -59,20 +59,21 @@ struct ArgumentSpec { for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; pod.is_tensor = static_cast(inputs[i].isTensor()); - if (!pod.is_tensor) continue; - at::Tensor t = inputs[i].toTensor(); - pod.defined = t.defined(); - if (pod.defined) { - pod.type = static_cast(t.type().scalarType()); - pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); - total_dims += t.ndimension(); - auto sizes = t.sizes(); - std::copy(sizes.begin(),sizes.end(), next_dim); - next_dim += sizes.size(); - auto strides = t.strides(); - std::copy(strides.begin(), strides.end(), next_dim); - next_dim += strides.size(); + if (pod.is_tensor) { + at::Tensor t = inputs[i].toTensor(); + pod.defined = t.defined(); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); + pod.device = (!t.type().is_cuda()) ? 
-1 : t.get_device(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); + total_dims += t.ndimension(); + auto sizes = t.sizes(); + std::copy(sizes.begin(),sizes.end(), next_dim); + next_dim += sizes.size(); + auto strides = t.strides(); + std::copy(strides.begin(), strides.end(), next_dim); + next_dim += strides.size(); + } } // each POD has a running tally of all dimensions including its own pod.total_dims = total_dims; diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index f69790cab52e00..53b87af9ef991d 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -28,7 +28,7 @@ struct AttributeValue { Symbol name; virtual AttributeKind kind() const = 0; virtual Ptr clone() const = 0; - virtual ~AttributeValue() {} + virtual ~AttributeValue() = default; }; template @@ -101,7 +101,7 @@ struct AttributeError : public std::exception { // we return Derived* pointers because Nodes are normally held as pointers. template struct Attributes { - Attributes() {} + Attributes() = default; void copyAttributes(const Attributes & rhs) { values_.clear(); for(auto & i : rhs.values_) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index c830dc45a537f5..7f250bf7c452aa 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace torch { namespace jit { @@ -564,14 +565,13 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& _graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { Gradient grad_desc; // Take ownership of the graph - JIT_ASSERTM( - _graph.use_count() == 1, - "differentiate will mutate and destroy the graph, so it requires " - "graph.use_count() == 1, but found ", _graph.use_count()); - std::swap(_graph, grad_desc.f); + JIT_ASSERTM(graph.use_count() == 1, + "differentiate will mutate and destroy the graph, so it requires " + "graph.use_count() == 1, but found %d", graph.use_count()); + std::swap(graph, grad_desc.f); // XXX: Take care when handling outputs - they can be duplicated! 
WithInsertPoint guard(grad_desc.f->block()); diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index 6dd2be9db0e779..ea2b7a1170efeb 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -4,7 +4,9 @@ #include "torch/csrc/jit/ir.h" #include + #include +#include namespace torch { namespace jit { diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 3c4ad0c130ea31..e8445734662b55 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -24,6 +24,11 @@ Value* insertConstant( } else if(val.isIntList()) { n->is_(attr::value, val.toIntList()->elements()); n->output()->setType(ListType::ofInts()); + } else if(val.isTensorList()) { + n->ts_(attr::value, fmap(val.toTensorList()->elements(), [](const at::Tensor & t) { + return autograd::Variable(t).data(); + })); + n->output()->setType(ListType::ofTensors()); } else { throw std::runtime_error("Unsupported value kind: " + val.tagKind()); } @@ -66,6 +71,14 @@ RegisterOperators reg({ push(stack, is); return 0; }; + } else if(type->isSubtypeOf(ListType::ofTensors())) { + auto ts = fmap(node->ts(attr::value), [](const at::Tensor & t) -> at::Tensor { + return autograd::make_variable(t); + }); + return [ts](Stack& stack) { + push(stack, ts); + return 0; + }; } else { std::stringstream ss; ss << "constant literal not supported for: " << type->str(); diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 8d20045efefe6a..22f8b40ba30542 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -345,18 +345,14 @@ std::vector emitCompilationUnit(std::ostream & out, size_t i = 0; for(auto o : subgraph.outputs()) { auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != aten::cat) { + if(o->node()->kind() != prim::FusedConcat) { emitFormal(o, desc); concat_desc.emplace_back(); flat_output_nodes.push_back(o); } else { auto cat = o->node(); - auto tensor_inputs = cat->inputs(); - // We need to drop the dim arg - tensor_inputs = tensor_inputs.slice(0, tensor_inputs.size() - 1); - size_t nInputs = tensor_inputs.size(); - concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); - for(auto c : tensor_inputs) { + concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); + for(auto c : cat->inputs()) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); } @@ -386,8 +382,9 @@ std::vector emitCompilationUnit(std::ostream & out, } for(auto n : subgraph.nodes()) { - if(n->kind() == aten::cat) - continue; // Concat nodes by narrowing the output Tensors before the kernel runs + // FusedConcat nodes work by narrowing the output Tensors before the kernel runs + if (n->kind() == prim::FusedConcat) + continue; env.s("node",valueName(n->output())); env.s("rhs", encodeRHS(n)); body << format("auto ${node} = ${rhs};\n",env); diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h index 6c4759aefb692a..c2f35ee0aa2074 100644 --- a/torch/csrc/jit/fusion_compiler.h +++ b/torch/csrc/jit/fusion_compiler.h @@ -86,7 +86,7 @@ struct CompiledFusionFunction { TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); - virtual ~CompiledFusionFunction() {} + virtual ~CompiledFusionFunction() = default; // expects outputs to be pre-allocated void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); diff --git a/torch/csrc/jit/graph_executor.cpp 
b/torch/csrc/jit/graph_executor.cpp index df81c378ad137d..bd115516208f04 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -21,6 +21,7 @@ #include "torch/csrc/jit/passes/specialize_undef.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_grad_of.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" @@ -516,28 +517,28 @@ void runRequiredPasses(const std::shared_ptr& g) { RemoveExpands(g); } -void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& spec) { +void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec) { // clean up GradOf and AutogradAdd nodes // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { defined.push_back(spec.at(i).defined()); } - specializeUndef(*graph_, defined); + specializeUndef(*graph, defined); // required passes shared with autograd fallback - runRequiredPasses(graph_); + runRequiredPasses(graph); // Decompose addmm nodes to add + mm, so expands can be inserted and // gradients accumulated on the backward pass // // In the future, if we need more passes like this, we should convert this // into a generic canonicalization pass. - DecomposeAddmm(graph_); + DecomposeAddmm(graph); // clean up dead constants from specialization - EliminateDeadCode(graph_); + EliminateDeadCode(graph); // calculate all input shapes - PropagateInputShapes(*graph_, spec); + PropagateInputShapes(*graph, spec); } void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables) { @@ -554,7 +555,7 @@ void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariab // They also may assume that concrete sizes/strides are availiable UnrollLoops(graph); - + ConstantPropagation(graph); //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. PeepholeOptimize(graph); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 4e862c9e0a1e44..2693af50af1025 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -34,7 +34,7 @@ struct GraphExecutorState { struct GraphExecutorImpl; struct TORCH_API GraphExecutor { - GraphExecutor() {} + GraphExecutor() = default; GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. 
GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); diff --git a/torch/csrc/jit/graph_node_list.h b/torch/csrc/jit/graph_node_list.h index 996a8b2c75fa0f..054b9517776863 100644 --- a/torch/csrc/jit/graph_node_list.h +++ b/torch/csrc/jit/graph_node_list.h @@ -1,3 +1,5 @@ +#pragma once + #include "torch/csrc/jit/assertions.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 5b128fd822dafd..40bc9966b45c18 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -66,7 +66,7 @@ struct Model_ { // Readers struct ReaderBase { - ReaderBase() {} + ReaderBase() = default; ReaderBase(pb_callback_t& cb) { initialize_callback(cb); } diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index d3a9bd9139a96e..7499e3152e41b3 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -18,6 +18,7 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/to_batch.h" #include "torch/csrc/jit/passes/specialize_undef.h" @@ -75,6 +76,9 @@ void initJITBindings(PyObject *module) { .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) + .def("_jit_pass_constant_propagation", [](std::shared_ptr& g) { + return ConstantPropagation(g); + }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 52b8cb0eaccd98..c567793552d73a 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -50,6 +50,7 @@ _(prim, TensorToNum) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ +_(prim, FusedConcat) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 65bdcf695f6de2..da6f629d629e41 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -339,7 +339,7 @@ struct ContainerTensor : public at::TensorImpl { ContainerTensor() : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined)), nullptr) {} - virtual ~ContainerTensor() {} + virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { throw std::runtime_error("sizes() on ContainerTensor"); } @@ -685,8 +685,8 @@ struct CodeImpl { // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { - InterpreterStateImpl(const Code & function_) - : function(function_.pImpl), + InterpreterStateImpl(const Code & code) + : function(code.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { @@ -775,15 +775,15 @@ std::ostream & operator<<(std::ostream & out, const Code & code) { Code::Code(std::shared_ptr& graph) : pImpl(new CodeImpl(graph)) {} -Code::~Code() {} +Code::~Code() = default; const std::vector& Code::executors() { return pImpl->executors(); } -InterpreterState::InterpreterState(const Code & function) - : pImpl(new InterpreterStateImpl(function)) {} 
-InterpreterState::~InterpreterState() {} +InterpreterState::InterpreterState(const Code & code) + : pImpl(new InterpreterStateImpl(code)) {} +InterpreterState::~InterpreterState() = default; void InterpreterState::runOneStage(Stack & stack) { return pImpl->runOneStage(stack); diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 7f09b22b324d11..317d30b05967a5 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -355,7 +355,7 @@ void Graph::lint() const { // - every use will occur later in the topsort struct LintScope { - LintScope() {} + LintScope() = default; LintScope(std::unique_ptr parent) : parent(std::move(parent)) {} bool contains(const Value * v) { @@ -487,13 +487,13 @@ void LintGraph(std::shared_ptr& graph) { graph->lint(); } -void Block::cloneFrom(Block * src, std::function outer_map) { +void Block::cloneFrom(Block * src, std::function value_map) { std::unordered_map local_map; auto env = [&](Value * v) { auto it = local_map.find(v); if(it != local_map.end()) return it->second; - return outer_map(v); + return value_map(v); }; auto graph = owningGraph(); @@ -619,23 +619,8 @@ Value* Node::namedInput(Symbol name) const { // so this is completely unsafe and needs to be gone as soon as possible. return v; } - const auto & the_schema = schema(); - int64_t tensor_list_pos = 0; - for (auto & arg : the_schema.arguments) { - if (*arg.type == *ListType::ofTensors()) - break; - tensor_list_pos++; - } int64_t arg_pos = findArgument(schema(), name).first; - // XXX: we don't have a single value we could give for a Tensor[], - // because we flatten lists into arguments - JIT_ASSERT(arg_pos != tensor_list_pos); - // NB: if there's no tensor list, then tensor_list_pos == arguments.size(), so this is always true - if (arg_pos < tensor_list_pos) { - return input(arg_pos); - } else { - return input(inputs().size() - (the_schema.arguments.size() - arg_pos)); - } + return input(arg_pos); } bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 9af468e6ee06e7..9a5c3342bf5df6 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -54,7 +54,7 @@ struct Value; TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); -TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & n); // A list of nodes, with inputs and outputs struct Block; @@ -683,7 +683,7 @@ struct Node : public Attributes { return *schema_; } - virtual ~Node() {} + virtual ~Node() = default; private: std::pair findInput(Symbol name); void findSchema() const; @@ -889,8 +889,7 @@ friend struct Block; , block_(new Block(this, nullptr)) , insert_before_(return_node()) {} - Graph() - : Graph( std::make_shared()) {} + Graph() : Graph(std::make_shared()) {} at::ArrayRef inputs() { return block_->inputs(); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 42a5be89e55e4b..60801c49c2ce33 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -83,6 +83,7 @@ struct ConstantList; struct IValue; using Tuple = ConstantList; using IntList = ConstantList; +using TensorList = ConstantList; using DoubleList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold @@ -93,7 +94,7 @@ using DoubleList = ConstantList; // retain/release calls. 
#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(TensorList) struct IValue { IValue() @@ -223,6 +224,20 @@ struct IValue { return toRetainable(); } + //TensorList + IValue(Shared v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + Shared toTensorList() && { + JIT_ASSERT(isTensorList()); + return moveToRetainable(); + } + Shared toTensorList() const & { + JIT_ASSERT(isTensorList()); + return toRetainable(); + } + + // None bool isNone() { return Tag::None == tag; } @@ -369,6 +384,13 @@ inline IValue::IValue(Shared v) inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} +inline IValue::IValue(Shared v) +: tag(Tag::TensorList), retainable(true) { + as_retainable = v.detach(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + inline std::vector IValue::copyToIntList() const { return std::vector(toIntList()->elements()); } diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index f19d18caa9289e..66af83bb9525fb 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -359,22 +359,6 @@ bool Operator::matches(const Node* node) const { return false; } attributes_seen++; - } else if(*arg.type == *ListType::ofTensors()) { - // Tensor[] is handled as varargs, consume inputs until the remaining required arguments - // XXX - there can only be a single Tensor[] in a declaration - size_t remaining_required = 0; - for(size_t j = arg_i + 1; j < schema.arguments.size(); ++j){ - // remaining arguments are only those that won't be consumed from attributes - if(attributes_size == 0 || !attributeKindOf(schema.arguments[j].type)) - remaining_required++; - } - while(inputs_size - input_i > remaining_required) { - auto input = node->inputs()[input_i++]; - if(!typeMatches(input->type(), DynamicType::get())) { - // std::cout << "vararg argument is not Dynamic\n"; - return false; - } - } } else { if(input_i == inputs_size) { // std::cout << "not enough inputs\n"; diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 7e6a314d2cb8c3..081344d0d1d83e 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -10,7 +10,7 @@ namespace torch { namespace jit { -FunctionSchema parseSchema(const std::string& decl); +FunctionSchema parseSchema(const std::string& schema); using OperationCreator = std::function; @@ -33,7 +33,7 @@ struct TORCH_API Operator { FunctionSchema schema; - bool matches(const Node* n) const; + bool matches(const Node* node) const; // Operators have different versions depending on if some inputs are encoded // as attributes or inputs. This function returns the right Operation function, // given a node encoded for one variant. diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 0e40bc8831a6df..414dc1652a4da1 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -3,8 +3,9 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/constants.h" -#include "torch/csrc/utils/functional.h" +#include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/assertions.h" +#include "torch/csrc/utils/functional.h" #include #include @@ -191,12 +192,11 @@ void BatchMMBlock(Block* block) { int cat_dim = s == Side::LHS ? 
1 : 0; cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes - auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); WithInsertPoint iguard { root.node }; - inputs.push_back(insertConstant(*graph, cat_dim)); - Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); - cat->output()->setType(type->withSizes(cat_sizes)); - return cat->output(); + auto inputs = fmap(matmuls, [=](Node *mm) -> SymbolicVariable { return mm->inputs()[inputs_off]; }); + auto cat_output = SymbolicVariable::cat(inputs, cat_dim).value(); + cat_output->setType(type->withSizes(cat_sizes)); + return cat_output; }; auto lhs_batch = batch_inputs(Side::LHS, root.lhs_sizes); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp new file mode 100644 index 00000000000000..39492f9e76c50c --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -0,0 +1,95 @@ +#include "torch/csrc/jit/passes/constant_propagation.h" +#include "torch/csrc/autograd/variable.h" +#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ivalue.h" +#include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" +#include "torch/csrc/utils/functional.h" + +namespace torch { namespace jit { + +namespace { + +std::unordered_set skip_list = { + //FIXME If & Loop require special casing because they cannot be run as a + //single node. + prim::If, + prim::Loop, + //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be + //FIXME treated as having side effects but ONNX depends on them being removed + prim::Print, + //all the rand functions from native_functions.yaml + aten::permute, + aten::rand, + aten::rand_out, + aten::rand_like, + aten::randint, + aten::randint_out, + aten::randint_like, + aten::randn, + aten::randn_out, + aten::randn_like, + aten::randperm, + aten::randperm_out, + }; + +std::vector runNode(Node* n) { + auto op = getOperation(n); + Stack stack; + for (auto input : n->inputs()) { + stack.push_back(*(toIValue(input))); + } + op(stack); + auto var_outputs = fmap(stack, [&](IValue v) { + if (v.isTensor()) { + return IValue(autograd::as_variable_ref(v.toTensor()).data()); + } else { + return v; + } + }); + return var_outputs; +} + +void propagateNode(Node* n) { + auto outputs = runNode(n); + auto graph = n->owningGraph(); + WithInsertPoint guard(n); + for (size_t i = 0; i < outputs.size(); ++i) { + auto new_output = insertConstant(*graph, outputs[i]); + n->outputs()[i]->replaceAllUsesWith(new_output); + // let dce elimination remove n + } +} + +} // anonymous namespace + +void ConstantPropagation(Node* n, bool recurse) { + bool constant_inputs = (n->inputs().size() > 0) && + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = skip_list.count(n->kind()) == 0; + if (constant_inputs && supported_node) { + propagateNode(n); + } + if (recurse) { + for (Block * block : n->blocks()) + ConstantPropagation(block, recurse); + } +} + +void ConstantPropagation(Block* block, bool recurse) { + ConstantPropagation(block->param_node(), recurse); + for (auto n: block->nodes()) { + ConstantPropagation(n, recurse); + } +} + +void ConstantPropagation(std::shared_ptr& graph) { + ConstantPropagation(graph->block(), true); + EliminateDeadCode(graph); +} + +}} diff --git a/torch/csrc/jit/passes/constant_propagation.h 
b/torch/csrc/jit/passes/constant_propagation.h new file mode 100644 index 00000000000000..12df329c81ccfc --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +TORCH_API void ConstantPropagation(std::shared_ptr& graph); +TORCH_API void ConstantPropagation(Block* block, bool recurse); +TORCH_API void ConstantPropagation(Node* n, bool recurse); + +}} diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index cb3757cffb0e34..cc8dcb8926dee0 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -177,16 +177,25 @@ struct GraphFuser { } } - bool allCatInputsHaveSameSize(Node * node) { - JIT_ASSERT(node->kind() == aten::cat); - std::vector inputs = node->inputs(); - if (!node->hasAttributes()) { - inputs.pop_back(); // Get rid of the dim argument - } + bool isFusableCatNode(Node * node) { + if (node->kind() != aten::cat) + return false; + if (!node->is_constant(attr::dim)) + return false; - auto expected = inputs.at(0)->type()->cast(); + auto tensors_node = node->namedInput(attr::tensors)->node(); + if (tensors_node->kind() != prim::ListConstruct) return false; + // NB: Note that technically other uses of the list aren't a big problem for us. + // It would be enough to place the prim::FusedConcat before the prim::ListConstruct, and + // allUsersAreThisConsumerOrOccurAfterIt would still be satisfied. However, I don't expect this + // to be necessary any time soon, and so we're simply assuming that we don't have to deal with that. + if (tensors_node->output()->uses().size() > 1) return false; + auto tensors = tensors_node->inputs(); + + // Our fusion code assumes that all inputs have the same shapes, so we need to check this too. + auto expected = tensors.at(0)->type()->cast(); if (!expected) return false; - return std::all_of(inputs.begin(), inputs.end(), [expected](Value *v) { + return std::all_of(tensors.begin(), tensors.end(), [&expected](Value *v) { auto actual = v->type()->cast(); return actual && actual->sizes() == expected->sizes(); }); @@ -197,15 +206,7 @@ struct GraphFuser { // because it is not a simple map, can be put in a fusion group // as long as no items in the group read the output of concat bool isFusableAsExitNode(Node * node) { - if(isFusable(node)) - return true; - // this concat fusion only works when all the inputs are the same size - // and we can statically infer the dimension along which we should concat - // otherwise they cannot partipate in the same map - if(node->kind() == aten::cat && node->is_constant(attr::dim) && allCatInputsHaveSameSize(node)) - return true; - - return false; + return isFusable(node) || isFusableCatNode(node); } // necessary condition for fusion. If all of the uses of producer are consumer @@ -241,8 +242,9 @@ struct GraphFuser { // we can move the consumer up into the producer. // but this requires better handling of merging fusion groups so it is not done now at::optional consumer_device = getDevice(consumer); + Node *real_consumer = consumer->kind() == aten::cat ? 
consumer->namedInput(attr::tensors)->node() : consumer; return isFusable(producer->node()) && - allUsersAreThisConsumerOrOccurAfterIt(consumer, producer) && + allUsersAreThisConsumerOrOccurAfterIt(real_consumer, producer) && consumer_device && consumer_device == getDevice(producer->node()) && (*consumer_device != kCPUDevice || sharedFusionCompiler().canCompileOnCPU()); } @@ -389,7 +391,24 @@ struct GraphFuser { Node * fuse(Node * consumer, Value * producer) { auto group = consumer; - if(group->kind() != prim::FusionGroup) { + if (consumer->kind() == aten::cat) { + Graph * graph = consumer->owningGraph(); + Node * list_construct = consumer->namedInput(attr::tensors)->node(); + int64_t dim = consumer->get(attr::dim).value(); + + Node * fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())->i_(attr::dim, dim); + fused_cat->insertBefore(list_construct); + fused_cat->output()->copyMetadata(consumer->output()); + consumer->output()->replaceAllUsesWith(fused_cat->output()); + topological_index[fused_cat] = topological_index[list_construct]; + + // NB: this deletes the fused_cat node from the original graph + group = createSingletonFusionGroup(fused_cat); + consumer->destroy(); + if (list_construct->output()->uses().empty()) { + list_construct->destroy(); + } + } else if (consumer->kind() != prim::FusionGroup) { group = createSingletonFusionGroup(consumer); } if (producer->node()->kind() == prim::FusionGroup) { @@ -450,7 +469,6 @@ struct GraphFuser { } } - // TODO: Remove this restriction if we ever need to distribute across // multiple return operators Node * producer_for_chunk_node = producer_for_chunk->node(); JIT_ASSERT(producer_for_chunk_node->outputs().size() == 1); @@ -521,11 +539,14 @@ struct GraphFuser { std::pair scanNode(Node * consumer) { auto stage_guard = block->owningGraph()->setStageTemporary(consumer->stage()); if(isFusableAsExitNode(consumer)) { + value_list inputs; + auto consumer_inputs = consumer->kind() == aten::cat ? + consumer->namedInput(attr::tensors)->node()->inputs() : + consumer->inputs(); // handle inputs in reverse topological order as well... // otherwise in f(a,a+b) it will appear a is used twice if we consider // the f-a fusion before the f-(a+b) fusion first. 
- value_list inputs; - for(auto i : consumer->inputs()) { + for(auto i : consumer_inputs) { if (i->node()->owningBlock() == block) { inputs.push_back(i); JIT_ASSERT(topological_index.count(i->node()) > 0); diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h index a0a881e3002ed9..0ec3589e3acd31 100644 --- a/torch/csrc/jit/passes/lower_grad_of.h +++ b/torch/csrc/jit/passes/lower_grad_of.h @@ -10,6 +10,6 @@ namespace torch { namespace jit { // outputs = // else: // outputs = undefineds -TORCH_API void LowerGradOf(Graph& graph); +TORCH_API void LowerGradOf(Graph& g); }} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 63fb7030aa3ad1..ee9b76f417bd17 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -263,6 +263,39 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { default: break; // fall-through } + if (node->matches("aten::cat(Tensor[] tensors, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto list_node = node->namedInput(attr::tensors)->node(); + JIT_ASSERT(list_node->kind() == prim::ListConstruct); + auto tensors = list_node->inputs(); + if (tensors.size() > 0) { + auto input_types = fmap(tensors, [](Value *v) { return v->type()->cast(); }); + if (std::all_of(input_types.begin(), input_types.end(), + [](const TensorTypePtr& tp) { return tp != nullptr; })) { + std::vector sizes = input_types[0]->sizes(); + const int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + const int64_t ndim = sizes.size(); + + if (dim < 0 || dim >= ndim) + goto cat_fail; + + sizes[dim] = 0; + for (auto & tp : input_types) { + auto & tp_sizes = tp->sizes(); + if (sizes.size() != tp_sizes.size()) + goto cat_fail; + for (int64_t i = 0; i < ndim; ++i) { + if (sizes[i] != tp_sizes[i] && i != dim) { + goto cat_fail; + } + } + sizes[dim] += tp_sizes[dim]; + } + node->output()->setType(input_types[0]->withSizes(sizes)); + return; + } + } + } +cat_fail: bool can_propagate_by_running = canPropagateShapeByRunningIt(node); auto maybe_tensor_types = gatherTensorTypes(node); diff --git a/torch/csrc/jit/python_arg_flatten.h b/torch/csrc/jit/python_arg_flatten.h index b5139032fde169..9894b802b2d29c 100644 --- a/torch/csrc/jit/python_arg_flatten.h +++ b/torch/csrc/jit/python_arg_flatten.h @@ -104,7 +104,7 @@ struct ParsedArgs { ParsedArgs flatten(py::handle obj); -PyObject* unflatten(at::ArrayRef outputs, +PyObject* unflatten(at::ArrayRef vars, const IODescriptor& structure); }}} // namespace torch::jit::python diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 81211085569953..b72fdb6b8860b1 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -451,10 +451,22 @@ void initPythonIRBindings(PyObject * module_) { .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); }) - ; + .def("__eq__", [](std::shared_ptr& self, std::shared_ptr& other) { + return *self == *other; + }) + .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { + return self->isSubtypeOf(other); + }); + py::class_>(m, "NumberType") + .def_static("get", &NumberType::get); + py::class_>(m, "IntType") + .def_static("get", &IntType::get); + py::class_>(m, "FloatType") + .def_static("get", &FloatType::get); py::class_>(m, "DynamicType") - .def(py::init([](){ return DynamicType::create(); })); + .def_static("get", &DynamicType::get); + py::class_>(m, "TupleType") .def(py::init([](std::vector a){ return 
TupleType::create(a); })) .def("elements", [](TupleType &self){ @@ -465,7 +477,9 @@ void initPythonIRBindings(PyObject * module_) { return types; }); py::class_>(m, "ListType") - .def_static("ofInts", &ListType::ofInts); + .def_static("ofInts", &ListType::ofInts) + .def_static("ofTensors", &ListType::ofTensors) + .def("getElementType", &ListType::getElementType); py::class_(m,"Use") .def_readonly("user",&Use::user) diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 7439b2b5e334cc..0496af67412654 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -103,10 +103,10 @@ void pythonRecordSourceLocation(Node* n) { n->setSourceLocation(sl); } -void initPythonTracerBindings(PyObject* module_) { +void initPythonTracerBindings(PyObject* module) { setRecordSourceLocation(pythonRecordSourceLocation); - auto m = py::handle(module_).cast(); + auto m = py::handle(module).cast(); py::class_>(m, "TracingState", py::dynamic_attr()) // NB: no constructor; you have to get it from C++ code .def("__repr__", [](const TracingState& s) { diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 8fe747e59900f0..90d49e230792e9 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -231,6 +231,18 @@ RegisterOperators reg({ push(stack, std::move(vals)); return 0; }; + } else if (lt->getElementType()->isSubtypeOf(DynamicType::get())) { + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i]).toTensor()); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } else { std::stringstream ss; ss << "unsupported list type: " << *lt->getElementType(); diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 0016f69b5ce07b..961dd3980933b3 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -351,27 +351,6 @@ Value* createNumber(Graph& g, const SourceRange& loc, const at::Tensor& val) { return output; } -Value* createStack(Graph& g, const SourceRange& loc, at::ArrayRef inputs) { - // bake in constant propagation for the all-constant case because it is - // common to see constant lists like [1, 2] passed to attributes - bool all_constant = std::all_of(inputs.begin(), inputs.end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - if(all_constant) { - auto values = fmap(inputs, [&](Value* v) { - return v->node()->t(attr::value); - }); - return insertConstant(g, at::stack(values), loc); - } - return g.insertNode(g.create(aten::stack, inputs) - ->i_(attr::dim, 0) - ->setSourceLocation(std::make_shared(loc)))->output(); -} - -static bool isTensorSubtype(Value* v) { - return v->type()->isSubtypeOf(DynamicType::get()); -} - at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as>(input); if(list) @@ -455,51 +434,46 @@ at::optional> tryMatchSchema( } // check input types - std::vector flat_inputs; + std::vector matched_inputs; for(size_t i = 0; i < schema.arguments.size(); ++i) { - NamedValue v = *positional_inputs[i]; + Value* value = positional_inputs[i]->value; const auto& arg = schema.arguments[i]; // some functions that take lists of integers for fixed size arrays // also allow single ints to be passed in their place. 
// the single int is then repeated to the length of the list - if (isIntUsedAsIntList(v.value, arg)) { - std::vector repeated(*arg.N, v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); + if (isIntUsedAsIntList(value, arg)) { + std::vector repeated(*arg.N, value); + value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } - // Allow tuples that only contain integers to turn into lists of integers - if(*ListType::ofInts() == *arg.type && - v.value->type()->kind() == TypeKind::TupleType && - v.value->type()->isSubtypeOf(ListType::ofInts())) { - auto unpacked = createTupleUnpack(v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), unpacked))->output(); + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types + if (arg.type->kind() == TypeKind::ListType && + value->type()->kind() == TypeKind::TupleType && + value->type()->isSubtypeOf(arg.type)) { + auto unpacked = createTupleUnpack(value); + auto elem_type = arg.type->expect()->getElementType(); + value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } - if (v.value->node()->kind() == prim::None){ + if (value->node()->kind() == prim::None){ if (arg.type->isSubtypeOf(NumberType::get())) - v.value = insertConstant(graph, at::Scalar(NAN), loc); + value = insertConstant(graph, at::Scalar(NAN), loc); else - v.value = graph.insertNode(graph.createUndefined())->output(); + value = graph.insertNode(graph.createUndefined())->output(); } - if(!v.value->type()->isSubtypeOf(arg.type)) { + if(!value->type()->isSubtypeOf(arg.type)) { err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " - << v.value->type()->str() << "\n" - << v.loc; + << value->type()->str() << "\n" + << positional_inputs[i]->loc; return at::nullopt; } - // we only support tensor lists for builtins, where they must be flattened - if(arg.type->isSubtypeOf(ListType::ofTensors())) { - auto outputs = createTupleUnpack(v.value); - flat_inputs.insert(flat_inputs.end(), outputs.begin(), outputs.end()); - } else { - flat_inputs.push_back(v.value); - } + matched_inputs.push_back(value); } - return flat_inputs; + return matched_inputs; } @@ -513,20 +487,20 @@ static std::shared_ptr tryEmitBuiltin( at::ArrayRef attributes) { auto graph = method.graph(); - auto flat_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); - if(!flat_inputs) + auto matched_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); + if(!matched_inputs) return nullptr; // we successfully matched this schema, construct the node NodeKind kind(Symbol::aten(name)); - auto n = graph->insertNode(graph->create(kind, *flat_inputs, 0)) + auto n = graph->insertNode(graph->create(kind, *matched_inputs, 0)) ->setSourceLocation(std::make_shared(loc)); // special case for chunk when the chunks= is known // DO NOT ADD MORE SPECIAL CASES HERE, REFACTOR INTO A FUNCTION IF // NEEDED if(n->kind() == aten::chunk) { - auto value = constant_as((*flat_inputs)[1]); + auto value = constant_as((*matched_inputs)[1]); if(!value) { throw ErrorReport(loc) << "argument 'chunks' must be a constant"; } @@ -588,7 +562,7 @@ std::shared_ptr emitBuiltinCall( } static Value* ensureTensor(const SourceRange& range, Value* v) { - if(!isTensorSubtype(v)) { + if(!v->type()->isSubtypeOf(DynamicType::get())) { throw ErrorReport(range) << "expected a tensor value but found a " << v->type()->str(); } diff --git 
a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index 0b87cf56be6ad3..3c4dcb07a248ee 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -68,7 +68,7 @@ struct SugaredValue : public std::enable_shared_from_this { SourceRange loc, Method & m, // note: names for args will be 'argument 0', 'argument 1', etc.. - at::ArrayRef inputs, + at::ArrayRef inputs_, at::ArrayRef attributes, size_t n_binders) { // n_binders is always set to the number of variables an expression is @@ -89,7 +89,7 @@ struct SugaredValue : public std::enable_shared_from_this { throw ErrorReport(loc) << "cannot call a " << kind(); } - virtual ~SugaredValue() {} + virtual ~SugaredValue() = default; }; // most things in the environment are just simple value types diff --git a/torch/csrc/jit/script/tree.h b/torch/csrc/jit/script/tree.h index e3d69d2790682d..0b9bc7009e0162 100644 --- a/torch/csrc/jit/script/tree.h +++ b/torch/csrc/jit/script/tree.h @@ -89,7 +89,7 @@ struct Tree : std::enable_shared_from_this { throw std::runtime_error(ss.str()); } } - virtual ~Tree() {} + virtual ~Tree() = default; private: int kind_; diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 2c74ae7e0a4c77..7a23aa55df538f 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ -77,8 +77,8 @@ inline void pack(Stack & stack, T&& v) { } template<> -inline void pack(Stack & stack, std::vector&& ts) { - for(auto& t : ts) { +inline void pack(Stack & stack, std::vector&& v) { + for(auto& t : v) { stack.push_back(IValue(std::move(t))); } } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index e4d2f98ba0ea0f..ef6d41005789f8 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -119,18 +119,20 @@ struct SymbolicVariable { return create(t("narrow"), { *this, insertConstant(dim), insertConstant(start), insertConstant(length) }, 1)[0]; } static SymbolicVariable cat(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::cat, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::cat, {input_list, dim})[0]; } static SymbolicVariable cat(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); return SymbolicVariable::cat(inputs, inputs[0].insertConstant(dim)); } static SymbolicVariable stack(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::stack, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::stack, {input_list, dim})[0]; } static SymbolicVariable stack(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 8c9763f88353e5..d5d204f9465bd8 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -220,6 +220,9 @@ static void fusionTests() { testOne(1,2,0,2); + auto createFusedConcat = [](Graph & graph, at::ArrayRef inputs, int64_t dim) -> Value* { + return graph.insertNode(graph.create(prim::FusedConcat, inputs)->i_(attr::dim, dim))->output(); + }; auto testConcat = [&](int dim) 
{ Graph graph; @@ -227,7 +230,7 @@ static void fusionTests() { Var i1 = Var::asNewInput(graph); auto o0 = i0 * i1; o0.addAsOutput(); - Var::cat({i0, o0}, dim).addAsOutput(); + Var(createFusedConcat(graph, {i0, o0}, dim)).addAsOutput(); auto a = at::rand({3,4,5}, at::kCUDA); auto b = at::rand({4,3,5}, at::kCUDA).transpose(0,1); @@ -776,6 +779,9 @@ void argumentSpecTest() { REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); + Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) }; + ArgumentSpec with_const(true, stack); + REQUIRE(with_const.at(2).sizes().size() == 2); } void shapeAnalysisTest() { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index aec6eb4ddc9447..a0e2f65e617754 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -38,9 +38,9 @@ void addInputs(Node *n, const char * name, const std::string& value) { b void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); } void addInputs(Node *n, const char * name, at::TensorList value) { - for (auto & t : value) { - n->addInput(getValueTrace(t)); - } + Graph *g = n->owningGraph(); + Node *list_node = g->appendNode(g->createList(DynamicType::get(), fmap(value, getValueTrace))); + n->addInput(list_node->output()); } void addInputs(Node *n, const char * name, at::IntList value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index ebcc91a908c213..ddb4dfad0154ad 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -46,31 +46,31 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { return out; } -TypePtr DynamicType::get() { +DynamicTypePtr DynamicType::get() { static auto value = DynamicType::create(); return value; } -TypePtr NumberType::get() { +NumberTypePtr NumberType::get() { static auto value = NumberType::create(); return value; } -TypePtr IntType::get() { +IntTypePtr IntType::get() { static auto value = IntType::create(); return value; } -TypePtr FloatType::get() { +FloatTypePtr FloatType::get() { static auto value = FloatType::create(); return value; } -TypePtr NoneType::get() { +NoneTypePtr NoneType::get() { static auto value = NoneType::create(); return value; } -TypePtr ListType::ofTensors() { +ListTypePtr ListType::ofTensors() { static auto value = ListType::create(DynamicType::get()); return value; } -TypePtr ListType::ofInts() { +ListTypePtr ListType::ofInts() { static auto value = ListType::create(IntType::get()); return value; } diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 7b7d708a549b32..5d01cf0a1552c1 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -80,7 +80,7 @@ struct TORCH_API Type : std::enable_shared_from_this { JIT_ASSERT(T::Kind == kind()); return std::static_pointer_cast(shared_from_this()); } - virtual ~Type() {} + virtual ~Type() = default; }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -104,7 +104,7 @@ struct TORCH_API DynamicType : public Type { } static const TypeKind Kind = TypeKind::DynamicType; // global singleton - static TypePtr get(); + static DynamicTypePtr get(); private: DynamicType() : Type(TypeKind::DynamicType) {} @@ -237,8 +237,8 @@ struct TORCH_API ListType : public Type { return elem; } // common cast List[Tensor] - static TypePtr ofTensors(); - static TypePtr ofInts(); + static ListTypePtr ofTensors(); + static ListTypePtr ofInts(); private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(elem) {} @@ -326,7 +326,7 @@ struct TORCH_API NumberType : public Type { } static const TypeKind Kind = 
TypeKind::NumberType; // global singleton - static TypePtr get(); + static NumberTypePtr get(); private: NumberType() : Type(TypeKind::NumberType) {} @@ -351,7 +351,7 @@ struct TORCH_API FloatType : public Type { } static const TypeKind Kind = TypeKind::FloatType; // global singleton - static TypePtr get(); + static FloatTypePtr get(); private: FloatType() : Type(TypeKind::FloatType) {} @@ -376,7 +376,7 @@ struct TORCH_API IntType : public Type { } static const TypeKind Kind = TypeKind::IntType; // global singleton - static TypePtr get(); + static IntTypePtr get(); private: IntType() : Type(TypeKind::IntType) {} @@ -401,7 +401,7 @@ struct NoneType : public Type { } static const TypeKind Kind = TypeKind::NoneType; // global singleton - static TypePtr get(); + static NoneTypePtr get(); private: NoneType() : Type(TypeKind::NoneType) {} diff --git a/torch/csrc/jit/variable_tensor_list.h b/torch/csrc/jit/variable_tensor_list.h index eeae2a66b17e5f..0916fe6ac051d2 100644 --- a/torch/csrc/jit/variable_tensor_list.h +++ b/torch/csrc/jit/variable_tensor_list.h @@ -6,10 +6,10 @@ namespace torch { namespace jit { // a wrapper to mark places where we expect all the at::Tensors to be // variables struct variable_tensor_list : public std::vector { - variable_tensor_list() {} + variable_tensor_list() = default; template variable_tensor_list(InputIt first, InputIt last) - : std::vector(first, last) {} + : std::vector(first, last) {} explicit variable_tensor_list(std::vector && tensor) : std::vector(std::move(tensor)) {} }; diff --git a/torch/csrc/utils/hash.h b/torch/csrc/utils/hash.h index 05a5a27b51223a..954a7b5b7d0814 100644 --- a/torch/csrc/utils/hash.h +++ b/torch/csrc/utils/hash.h @@ -32,7 +32,7 @@ namespace torch { // DEALINGS IN THE SOFTWARE. inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); + return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index f8d5fd1ba1cd63..0160bdd2d8e506 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -16,7 +16,7 @@ std::string py_typename(PyObject *object) { struct Type { virtual bool is_matching(PyObject *object) = 0; - virtual ~Type() {}; + virtual ~Type() = default; }; struct SimpleType: public Type { diff --git a/torch/csrc/utils/invalid_arguments.h b/torch/csrc/utils/invalid_arguments.h index 138c3331113b7c..daaccfd877f377 100644 --- a/torch/csrc/utils/invalid_arguments.h +++ b/torch/csrc/utils/invalid_arguments.h @@ -7,7 +7,9 @@ namespace torch { std::string format_invalid_args( - PyObject *args, PyObject *kwargs, const std::string& name, + PyObject* given_args, + PyObject* given_kwargs, + const std::string& function_name, const std::vector& options); } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index b00bd27c087495..0f2f51904c2554 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -90,8 +90,8 @@ struct PythonArgParser { private: [[noreturn]] - void print_error(PyObject* args, PyObject* kwargs, PyObject* dst[]); - PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* dst[]); + void print_error(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); + PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); 
  std::vector<FunctionSignature> signatures_;
  std::string function_name;
diff --git a/torch/csrc/utils/tensor_apply.h b/torch/csrc/utils/tensor_apply.h
index 47fbaa672c4262..5dfdef98c81db4 100644
--- a/torch/csrc/utils/tensor_apply.h
+++ b/torch/csrc/utils/tensor_apply.h
@@ -6,8 +6,8 @@ namespace torch { namespace utils {

at::Tensor & apply_(at::Tensor & self, PyObject* fn);
-at::Tensor & map_(at::Tensor & self, const at::Tensor & other, PyObject* fn);
-at::Tensor & map2_(at::Tensor & self, const at::Tensor & other1,
-                   const at::Tensor & other2, PyObject* fn);
+at::Tensor & map_(at::Tensor & self, const at::Tensor & other_, PyObject* fn);
+at::Tensor & map2_(at::Tensor & self, const at::Tensor & x_,
+                   const at::Tensor & y_, PyObject* fn);

}} // namespace torch::utils
diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp
index 3a8b4a7bbc1592..d03fd55f2accfc 100644
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@@ -139,8 +139,10 @@ ScalarType infer_scalar_type(PyObject *obj) {
  }
#ifdef USE_NUMPY
  if (PyArray_Check(obj)) {
-    auto array = (PyArrayObject*)obj;
-    return numpy_dtype_to_aten(PyArray_TYPE(array));
+    return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)obj));
+  }
+  if (PyArray_CheckScalar(obj)) {
+    return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, NULL))));
  }
#endif
  if (PySequence_Check(obj)) {
diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py
index f8b26b121fd3e8..a2086ae95b899c 100644
--- a/torch/distributed/__init__.py
+++ b/torch/distributed/__init__.py
@@ -61,7 +61,8 @@ def init_process_group(backend, init_method='env://', **kwargs):
        group_name (str, optional): Group name. See description of init methods.

    To enable ``backend == mpi``, PyTorch needs to be built from source on a system that
-    supports MPI.
+    supports MPI. If you want to use Openmpi with CUDA-aware support, please use Openmpi
+    major version 2 and above.
    """
    world_size = kwargs.pop('world_size', -1)
diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py
index 77e6cf777f2784..c06a927add64b1 100644
--- a/torch/jit/annotations.py
+++ b/torch/jit/annotations.py
@@ -204,9 +204,9 @@ def as_ann(ann):

def ann_to_type(ann):
    if ann is None:
-        return DynamicType()
+        return DynamicType.get()
    elif ann is torch.Tensor:
-        return DynamicType()
+        return DynamicType.get()
    elif is_tuple(ann):
        return TupleType([ann_to_type(a) for a in ann.__args__])
    raise ValueError("The only supported annotations kinds are Tensor and Tuple[...]")
diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp
index cc176931d8c0c2..b23157581bdfc0 100644
--- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp
+++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp
@@ -100,6 +100,14 @@ void DataChannelMPI::destroy() {}

bool DataChannelMPI::init() {
+#ifdef OMPI_MAJOR_VERSION
+  // OMPI_* is specific to Openmpi implementation.
+  // Openmpi v1.10 segfaults in MPI_Bcast with CUDA buffer.
+  if (int(OMPI_MAJOR_VERSION) < 2) {
+    throw std::runtime_error("Please use Openmpi major version 2 and above for distributed.");
+  }
+#endif /* OMPI_MAJOR_VERSION */
+
  int provided;
  MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
  if (provided != MPI_THREAD_MULTIPLE) {
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 17a7c09b012da6..454230ad9e6171 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -1350,6 +1350,41 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1):

# loss

+def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0,
+             reduction='elementwise_mean'):
+    r"""The Connectionist Temporal Classification loss.
+
+    See :class:`~torch.nn.CTCLoss` for details.
+
+    Args:
+        log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
+            `T = input length`, and `N = batch size`.
+            The logarithmized probabilities of the outputs
+            (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
+        targets: :math:`(N, S)` or `(sum(target_lengths))`.
+            Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
+        input_lengths: :math:`(N)`.
+            Lengths of the inputs (must each be :math:`\leq T`)
+        target_lengths: :math:`(N)`.
+            Lengths of the targets
+        blank (int, optional):
+            Blank label. Default :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied,
+            'elementwise_mean': the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: 'elementwise_mean'
+
+    Example::
+
+        >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
+        >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long)
+        >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
+        >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
+        >>> loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
+        >>> loss.backward()
+    """
+    return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction))
+

def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100,
             reduce=None, reduction='elementwise_mean'):
diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py
index 4d98f482768a63..7f67ca9fce804d 100644
--- a/torch/nn/modules/__init__.py
+++ b/torch/nn/modules/__init__.py
@@ -6,7 +6,7 @@
    Softmax, Softmax2d, LogSoftmax, ELU, SELU, Hardshrink, LeakyReLU, LogSigmoid, \
    Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU
from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \
-    CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \
+    CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \
    MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \
    SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss
from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict
@@ -34,7 +34,7 @@
    'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'GLU', 'Hardshrink',
    'LeakyReLU', 'LogSigmoid', 'Softplus', 'Softshrink', 'PReLU', 'Softsign', 'Softmin',
    'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss',
-    'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss',
+    'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss',
    'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss',
    'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict',
    'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d',
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 489e8998843f98..ec7d60d8125152 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1123,6 +1123,61 @@ def forward(self, anchor, positive, negative):
        return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
                                     eps=self.eps, swap=self.swap, reduction=self.reduction)
+
+class CTCLoss(_Loss):
+    r"""The Connectionist Temporal Classification loss.
+
+    Args:
+        blank (int, optional): blank label. Default :math:`0`.
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied,
+            'elementwise_mean': the output losses will be divided by the target lengths and
+            then the mean over the batch is taken. Default: 'elementwise_mean'
+
+    Inputs:
+        log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
+            `T = input length`, and `N = batch size`.
+            The logarithmized probabilities of the outputs
+            (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
+        targets: :math:`(N, S)` or `(sum(target_lengths))`.
+            Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
+        input_lengths: :math:`(N)`.
+            Lengths of the inputs (must each be :math:`\leq T`)
+        target_lengths: :math:`(N)`.
+            Lengths of the targets
+
+
+    Example::
+
+        >>> ctc_loss = nn.CTCLoss()
+        >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
+        >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long)
+        >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
+        >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
+        >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
+        >>> loss.backward()
+
+    Reference:
+        A. Graves et al.: Connectionist Temporal Classification:
+        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
+        https://www.cs.toronto.edu/~graves/icml_2006.pdf
+
+    .. Note::
+        In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
+        in concatenated format, all :attr:`input_lengths` must be `T`, :math:`blank=0`,
+        :attr:`target_lengths` must be :math:`\leq 256`, and the integer arguments must be of
+        type :class:`torch.IntTensor`.
+
+        The regular implementation uses the (more common in PyTorch) `torch.long` dtype.
+ """ + + def __init__(self, blank=0, reduction='elementwise_mean'): + super(CTCLoss, self).__init__(reduction=reduction) + self.blank = blank + + def forward(self, log_probs, targets, input_lengths, target_lengths): + return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction) + # TODO: L1HingeEmbeddingCriterion # TODO: MSECriterion weight # TODO: ClassSimplexCriterion diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3ca44f35c4eff3..d19dadd0ceb935 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -70,6 +70,12 @@ def _get_const(value, desc, arg_name): return _parse_arg(value, desc) +def _unpack_list(list_value): + list_node = list_value.node() + assert list_node.kind() == "prim::ListConstruct" + return list_node.inputs() + + def parse_args(*arg_descriptors): def decorator(fn): def wrapper(g, *args): @@ -215,13 +221,18 @@ def reciprocal(g, self): return g.op("Div", _if_scalar_type_as(g, torch.ones(1), self), self) -# This syntax is Python 2 portable -def cat(g, *args): - dim = _get_const(args[-1], 'i', 'dim') - tensors = args[:-1] +@parse_args('v', 'i') +def cat(g, tensor_list, dim): + tensors = _unpack_list(tensor_list) return g.op("Concat", *tensors, axis_i=dim) +@parse_args('v', 'i') +def stack(g, tensor_list, dim): + unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in _unpack_list(tensor_list)] + return g.op("Concat", *unsqueezed, axis_i=dim) + + def mm(g, self, other): # Create a dummy C tensor. Only needed for API purposes, the value is # since beta = 0 @@ -349,11 +360,6 @@ def view(g, self, size): return g.op("Reshape", self, shape) -def stack(g, *args): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in args[:-1]] + [args[-1]] - return concat(g, *unsqueezed) - - @parse_args('v', 'i', 'i') def split(g, self, split_size, dim): size = self.type().sizes()[dim] @@ -555,9 +561,10 @@ def replication_pad(g, input, padding): @parse_args('v', 'is') def upsample_nearest2d(g, input, output_size): + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] return g.op("Upsample", input, - height_scale_f=float(output_size[-2]) / input.type().sizes()[-2], - width_scale_f=float(output_size[-1]) / input.type().sizes()[-1], + scales_f=[1., 1., height_scale, width_scale], mode_s="nearest") @@ -565,10 +572,11 @@ def upsample_nearest2d(g, input, output_size): def upsample_bilinear2d(g, input, output_size, align_corners): if align_corners: return _unimplemented("upsample_bilinear2d", "align_corners == True") - w_scale = float(output_size[-1]) / input.type().sizes()[-1] - h_scale = float(output_size[-2]) / input.type().sizes()[-2] - return g.op("Upsample", input, width_scale_f=w_scale, - height_scale_f=h_scale, mode_s="bilinear") + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] + return g.op("Upsample", input, + scales_f=[1., 1., height_scale, width_scale], + mode_s="bilinear") def gt(g, input, other): @@ -676,8 +684,10 @@ def index_select(g, self, dim, index): return g.op("Gather", self, index, axis_i=dim) -def index_put(g, *inputs): - return g.op("ATen", *inputs, operator_s='index_put') +def index_put(g, self, indices_list_value, values): + indices_list = list(_unpack_list(indices_list_value)) + args = [self] + indices_list + [values] + return g.op("ATen", *args, operator_s='index_put') def type_as(g, self, other): @@ -868,14 +878,17 @@ def topk(g, self, k, dim, 
largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) -@parse_args('v', 'is') def repeat(g, self, repeats): - if self.isTensor(): + if not _is_value(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + const_repeats = _maybe_get_const(repeats, 'is') + + if self.isTensor() and not _is_value(const_repeats): sizes = self.type().sizes() - diff_dims = len(repeats) - len(sizes) + diff_dims = len(const_repeats) - len(sizes) if diff_dims > 0: self = view(g, self, [1] * diff_dims + sizes) - return g.op("Tile", self, g.op("Constant", value_t=torch.LongTensor(repeats))) + return g.op("Tile", self, repeats) def instance_norm(g, input, **kwargs): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 4f9299d258ea3e..b770b900c4edd3 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -480,8 +480,14 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor raise RuntimeError("Unsupported prim::Constant kind: `{}`. Send a bug report.".format( n.kindOf("value"))) elif op_name == "ListConstruct": - unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] - return g.op("Concat", *unsqueezed, axis_i=0) + t = n.output().type() + # Tensor lists are used mostly for inputs to cat/stack. They need to be handled + # in those symbolics, and should become dead afterwards. + if t == torch._C.ListType.ofTensors(): + return None + elif t == torch._C.ListType.ofInts(): + unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] + return g.op("Concat", *unsqueezed, axis_i=0) elif op_name == "Undefined": # Undefined is not an ONNX operator; keep it as prim::Undefined # and let the exporter handle finally eliminating these diff --git a/torch/tensor.py b/torch/tensor.py index 6b587fcf903586..9784fd59c9d2fb 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -384,6 +384,8 @@ def __dir__(self): return sorted(keys) # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` + __array_priority__ = 1000 # prefer Tensor ops over numpy ones + def __array__(self, dtype=None): if dtype is None: return self.cpu().numpy()
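Editor's note (not part of the patch): the new docstrings also describe a concatenated form for `targets` (a 1-D tensor of length `sum(target_lengths)` instead of `(N, S)`), but both inline examples only exercise the padded form. A minimal sketch of the concatenated form, mirroring the shapes used in the docstring examples; the variable names here are illustrative only:

    import torch
    import torch.nn.functional as F

    T, N, C = 50, 16, 20  # input length, batch size, alphabet size (0 is the blank)
    log_probs = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full((N,), T, dtype=torch.long)
    target_lengths = torch.randint(10, 30, (N,), dtype=torch.long)
    # Second form: all targets concatenated into a single 1-D tensor;
    # labels must not use the blank index.
    targets = torch.randint(1, C, (int(target_lengths.sum()),), dtype=torch.long)

    loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
    loss.backward()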
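Editor's note (not part of the patch): the `__array_priority__ = 1000` line in torch/tensor.py leans on NumPy's convention of deferring a binary op to a non-ndarray operand that has a higher priority and implements the reflected method. A hedged sketch of the intended effect; the exact behaviour depends on the installed NumPy and PyTorch versions:

    import numpy as np
    import torch

    a = np.ones(3)
    t = torch.ones(3)

    # With the higher __array_priority__, np.ndarray.__add__ should return
    # NotImplemented, Python then falls back to Tensor.__radd__, and the mixed
    # expression is expected to stay a torch.Tensor rather than an ndarray.
    print(type(a + t))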