diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index f1eda3103a24af..0f26005f74cb22 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -1,15 +1,28 @@ #!/bin/bash +# For distributed, four environmental configs: +# (1) build with only NCCL +# (2) build with NCCL and MPI +# (3) build with only MPI +# (4) build with neither +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 +fi + +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then + # TODO: move this to Docker + sudo apt-get update + sudo apt-get install openmpi-bin libopenmpi-dev + sudo apt-get install -y --no-install-recommends openssh-client openssh-server + sudo mkdir -p /var/run/sshd +fi + if [[ "$BUILD_ENVIRONMENT" == "pytorch-linux-xenial-py3-clang5-asan" ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-asan.sh" $* fi -# TODO: move this to Docker -# TODO: add both NCCL and MPI in CI test by fixing these test first -sudo apt-get update -sudo apt-get install libnccl-dev libnccl2 -# sudo apt-get install openmpi-bin libopenmpi-dev - # Required environment variable: $BUILD_ENVIRONMENT # (This is set by default in the Docker images we build, so you don't # need to set it yourself. diff --git a/CMakeLists.txt b/CMakeLists.txt index 1009e5a4ec30f7..75b4bf7b4512d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,7 +306,7 @@ if(BUILD_DOCS) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/docs) - endif (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/docs) + endif() file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs) configure_file(${DOXYGEN_C_IN} ${DOXYGEN_C_OUT} @ONLY) @@ -323,10 +323,10 @@ if(BUILD_DOCS) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Generating Python API documentation with Doxygen" VERBATIM) - else (DOXYGEN_FOUND) + else() message(FATAL_ERROR "Doxygen needs to be installed to generate the documentation") - endif (DOXYGEN_FOUND) -endif (BUILD_DOCS) + endif() +endif() # ---[ CMake related files # Uninistall option. 
diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index f85996f74c4b76..5b420d87b34fc1 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -9,6 +9,9 @@ #include #include "ATen/CPUGenerator.h" +#include "ATen/RegisterCPU.h" + +#include "TH/TH.h" // for USE_LAPACK #ifdef USE_SSE3 #include @@ -34,7 +37,7 @@ Context::Context() generator_registry[static_cast(DeviceType::CPU)] .reset(new CPUGenerator(this)); - Type::registerCPU(this); + register_cpu_types(this); } // TODO: This could be bad juju if someone calls globalContext() in the @@ -79,6 +82,14 @@ bool Context::hasMKL() const { #endif } +bool Context::hasLAPACK() const { +#ifdef USE_LAPACK + return true; +#else + return false; +#endif +} + bool Context::setFlushDenormal(bool on) { #ifdef USE_SSE3 // Setting flush-to-zero (FTZ) flag diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index f2b3a452cfed57..bab1fa5dc5d069 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -50,6 +50,10 @@ class AT_API Context { return *generator; } bool hasMKL() const; + bool hasLAPACK() const; + bool hasMAGMA() const { + return detail::getCUDAHooks().hasMAGMA(); + } bool hasCUDA() const { return detail::getCUDAHooks().hasCUDA(); } @@ -114,6 +118,7 @@ class AT_API Context { std::atomic next_id; std::unique_ptr thc_state; friend struct Type; + friend void register_cpu_types(Context * context); friend void register_cuda_types(Context * context); }; @@ -157,6 +162,14 @@ static inline bool hasMKL() { return globalContext().hasMKL(); } +static inline bool hasLAPACK() { + return globalContext().hasLAPACK(); +} + +static inline bool hasMAGMA() { + return globalContext().hasMAGMA(); +} + static inline int64_t current_device() { return globalContext().current_device(); } diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index 7adddfca27c9eb..b51d80d22d350f 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include #include #include #include diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index ef04cc4bdfd975..dcdf7653f2308b 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -1,6 +1,5 @@ #include "ATen/Formatting.h" #include "ATen/Tensor.h" -#include "ATen/Context.h" #include "ATen/TensorMethods.h" #include diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index 8db0b231bf53f9..d797618b285e6a 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -26,8 +26,8 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->elementSize(); } - ptrdiff_t size() const { return storage_impl_->size(); } + size_t elementSize() const { return storage_impl_->itemsize(); } + ptrdiff_t size() const { return storage_impl_->numel(); } bool resizable() const { return storage_impl_->resizable(); } // get() use here is to get const-correctness void* data() const { return storage_impl_.get()->data(); } diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index 233540bfa06f28..0ed836b9b3010a 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -1,31 +1,29 @@ -#include #include namespace at { StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable) : data_type_(data_type), data_ptr_(std::move(data_ptr)), - 
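The new Context methods surface build-time capabilities: hasLAPACK() is decided by the USE_LAPACK define pulled in through TH/TH.h, while hasMAGMA() is forwarded to the CUDA hooks so a CPU-only build can still answer the query. A minimal caller-side sketch, assuming the usual ATen/ATen.h umbrella header is available:

#include <cstdio>
#include <ATen/ATen.h>  // assumed umbrella header; the free functions are declared in ATen/Context.h

int main() {
  // Each call reads a flag resolved at build time (or via the CUDA hooks for MAGMA).
  std::printf("MKL:    %d\n", (int)at::hasMKL());
  std::printf("LAPACK: %d\n", (int)at::hasLAPACK());
  std::printf("CUDA:   %d\n", (int)at::hasCUDA());
  std::printf("MAGMA:  %d\n", (int)at::hasMAGMA());  // false unless the CUDA hooks report USE_MAGMA
  return 0;
}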
size_(size), + numel_(numel), resizable_(resizable), - allocator_(allocator), - finalizer_(nullptr) {} + allocator_(allocator) {} StorageImpl::StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable) : StorageImpl( data_type, - size, + numel, allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * size), + at::elementSize(dataTypeToScalarType(data_type)) * numel), allocator, resizable) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index a9394d53935636..35639478df664e 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -3,7 +3,6 @@ #include #include #include -#include #include @@ -21,16 +20,16 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl() = delete; - virtual ~StorageImpl() {}; + ~StorageImpl() {}; StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, bool resizable); StorageImpl( at::DataType data_type, - ptrdiff_t size, + int64_t numel, at::Allocator* allocator, bool resizable); StorageImpl(StorageImpl&) = delete; @@ -44,7 +43,7 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { auto data_type_T = - at::scalarTypeToDataType(at::CTypeToScalarType>::to()); + at::scalarTypeToDataType(at::CTypeToScalarType::to()); if (dtype() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", @@ -61,27 +60,22 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { } void release_resources() override { - if (finalizer_) { - (*finalizer_)(); - } - finalizer_ = nullptr; data_ptr_.clear(); } void operator=(const StorageImpl&) = delete; - size_t elementSize() const { + size_t itemsize() const { return at::elementSize(dataTypeToScalarType(data_type_)); } Type& type(); - // TODO: Rename to size() and size to size_ - ptrdiff_t size() const { - return size_; + int64_t numel() const { + return numel_; }; - void set_size(ptrdiff_t size) { - size_ = size; + void set_numel(int64_t numel) { + numel_ = numel; }; bool resizable() const { return resizable_; @@ -132,9 +126,8 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { private: at::DataType data_type_; at::DataPtr data_ptr_; - ptrdiff_t size_; + int64_t numel_; bool resizable_; at::Allocator* allocator_; - std::unique_ptr finalizer_; }; } // namespace at diff --git a/aten/src/ATen/TensorBase.h b/aten/src/ATen/TensorBase.h deleted file mode 100644 index 1bda3ddfa14915..00000000000000 --- a/aten/src/ATen/TensorBase.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include "ATen/TensorImpl.h" -#include "ATen/UndefinedTensor.h" -#include "ATen/core/Error.h" - -namespace at { namespace detail { - -// TensorBase is the base class for Tensor. -// TODO: Eliminate this, once we remove TensorBase from Scalar. 
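The rename from size()/elementSize() to numel()/itemsize() makes the units explicit: numel() counts elements of the storage's data type, itemsize() is the per-element byte width, and their product is the allocation size (exactly what the StorageImpl constructor above asks the allocator for). A hypothetical helper, not part of the diff, that a caller could write against at::Storage, whose elementSize()/size() accessors still forward to the renamed StorageImpl methods:

#include <cstddef>
#include <cstdint>
#include <ATen/Storage.h>  // assumed include path for at::Storage

// Illustrative only: total bytes held by a storage after the rename.
size_t storage_nbytes(const at::Storage& storage) {
  const int64_t numel = storage.size();          // forwards to StorageImpl::numel()
  const size_t itemsize = storage.elementSize(); // forwards to StorageImpl::itemsize()
  return static_cast<size_t>(numel) * itemsize;
}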
At -// the moment it's only used to break an include cycle for Scalar -struct TensorBase { - TensorBase() {} - TensorBase(TensorImpl * tensor_impl, bool retain) : tensor_impl_(c10::intrusive_ptr::reclaim(tensor_impl)) { - if (tensor_impl == nullptr) { - throw std::runtime_error("TensorBaseImpl with nullptr not supported"); - } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } - } - TensorBase(c10::intrusive_ptr&& ptr) : tensor_impl_(std::move(ptr)) {} - TensorBase(const c10::intrusive_ptr& ptr) : tensor_impl_(ptr) {} - - int64_t dim() const { - return tensor_impl_->dim(); - } - - TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); - } - TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); - } - const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; - } - - bool defined() const { - return tensor_impl_; - } - - void reset() { - tensor_impl_.reset(); - } - - friend struct WeakTensor; - -protected: - c10::intrusive_ptr tensor_impl_; -}; - -}} // namespace at::detail diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 30b34cabec769f..8976acb6a40904 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -3,8 +3,6 @@ #include #include -#include "ATen/Retainable.h" -#include "ATen/StorageImpl.h" #include "ATen/Storage.h" #include "ATen/core/optional.h" #include "ATen/core/TensorTypeId.h" diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index c8717689833408..a598290485196d 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -2,10 +2,10 @@ #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 79f58479e90b52..f50a4e71da9cae 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -1,5 +1,4 @@ #include "ATen/UndefinedTensor.h" -#include "ATen/Context.h" #include "ATen/core/Error.h" namespace at { diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index 60d9c884b8aef2..2bc3965c6d33ae 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -3,8 +3,8 @@ namespace at { -UndefinedType::UndefinedType(Context* context) - : Type(context, UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} +UndefinedType::UndefinedType() + : Type(UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 9ca00cfb516ff7..d216e3131dd693 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -1,7 +1,6 @@ #pragma once #include "ATen/Type.h" -#include "ATen/Context.h" #include "ATen/CheckGenerator.h" #ifdef _MSC_VER @@ -13,7 +12,7 @@ namespace at { struct UndefinedType final : public Type { - explicit UndefinedType(Context* context); + explicit UndefinedType(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/core/SparseTensorRef.h similarity index 100% rename from aten/src/ATen/SparseTensorRef.h rename to aten/src/ATen/core/SparseTensorRef.h diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 
7d73fafc994da5..570a375e3888a3 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -69,7 +69,7 @@ DynamicCUDAInterfaceSetter _; // let's not if we don't need to!) std::unique_ptr CUDAHooks::initCUDA() const { THCState* thc_state = THCState_alloc(); - + THCudaInit(thc_state); return std::unique_ptr( thc_state, [](THCState* p) { @@ -92,6 +92,14 @@ bool CUDAHooks::hasCUDA() const { return true; } +bool CUDAHooks::hasMAGMA() const { +#ifdef USE_MAGMA + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 766ab62b8ef79f..491adfc4d73f1a 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -13,6 +13,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { std::unique_ptr initCUDA() const override; std::unique_ptr initCUDAGenerator(Context*) const override; bool hasCUDA() const override; + bool hasMAGMA() const override; bool hasCuDNN() const override; int64_t current_device() const override; Allocator* getPinnedMemoryAllocator() const override; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 6b2e87c4f762af..cccf6dc28453dc 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -65,6 +65,10 @@ struct AT_API CUDAHooksInterface { return false; } + virtual bool hasMAGMA() const { + return false; + } + virtual bool hasCuDNN() const { return false; } diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f859edd3ede3a..f7a4deb58dc941 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -109,6 +109,9 @@ def check_all_files_written(self): TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") TYPE_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.cpp") +REGISTER_CPU_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.h") +REGISTER_CPU_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCPU.cpp") + REGISTER_CUDA_H = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.h") REGISTER_CUDA_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/RegisterCUDA.cpp") @@ -122,7 +125,7 @@ def check_all_files_written(self): TYPE_REGISTER = CodeTemplate("""\ context->type_registry[static_cast(Backend::${backend})] [static_cast(ScalarType::${scalar_type})] - .reset(new ${type_name}(context)); + .reset(new ${type_name}()); detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); """) @@ -280,19 +283,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['to_th_type'] = 'HalfFix<__half,Half>' - env['to_at_type'] = 'HalfFix' env['AS_REAL'] = 'convert' - env['THScalarType'] = 'half' - else: - env['to_th_type'] = 'HalfFix' - env['to_at_type'] = 'HalfFix' - elif scalar_name == 'Long': - env['to_th_type'] = 'long' - env['to_at_type'] = 'int64_t' - else: - env['to_th_type'] = '' - env['to_at_type'] = '' declarations, definitions = function_wrapper.create_derived( env, declarations) @@ -340,7 +331,8 @@ def iterate_types(): def declare_outputs(): files = ['Declarations.yaml', 'Type.h', 'Type.cpp', 'Tensor.h', 'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h'] + 'CPUCopy.cpp', 'NativeFunctions.h', + 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: 
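hasMAGMA() follows the existing hooks pattern: CUDAHooksInterface provides a virtual default that returns false, and the CUDA build supplies a CUDAHooks override that answers from USE_MAGMA, so CPU-only builds never touch MAGMA symbols. A stripped-down sketch of that pattern with simplified stand-in names (the real classes are the ones in the headers above):

#include <cstdio>

// Minimal stand-ins for CUDAHooksInterface / CUDAHooks from the diff.
struct HooksInterface {
  virtual ~HooksInterface() = default;
  virtual bool hasMAGMA() const { return false; }  // safe default for CPU-only builds
};

struct CudaHooks : HooksInterface {
  bool hasMAGMA() const override {
#ifdef USE_MAGMA
    return true;
#else
    return false;
#endif
  }
};

int main() {
  CudaHooks cuda;                  // in ATen this object only exists when CUDA is compiled in
  const HooksInterface& hooks = cuda;
  std::printf("MAGMA available: %d\n", (int)hooks.hasMAGMA());
}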
file_manager.will_write(f) cuda_files = ['CUDACopy.cpp', 'RegisterCUDA.cpp', 'RegisterCUDA.h'] @@ -409,6 +401,9 @@ def generate_outputs(): file_manager.write('Type.h', TYPE_H, top_env) file_manager.write('Type.cpp', TYPE_CPP, top_env) + file_manager.write('RegisterCPU.h', REGISTER_CPU_H, top_env) + file_manager.write('RegisterCPU.cpp', REGISTER_CPU_CPP, top_env) + cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 5b73a09ad9b004..07d7e46ff79a56 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index d9bd94e1f7810b..d5ff300c0dd9e2 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -47,6 +47,82 @@ std::tuple _unique_cpu_template( } return std::make_tuple(output, inverse_indices); } + +template +ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last, + std::vector& indices, Tensor inverse_indices_vec) { + if (first == last) { + return last; + } + // save to calculate distance to iterators + ForwardIt begin = first; + + // set first inverse index + inverse_indices_vec[indices[0]] = 0; + + ForwardIt result = first; + while (++first != last) { + if (!at::equal(*result, *first) && ++result != first) { + *result = std::move(*first); + } + int64_t idx_result = std::distance(begin, result); + int64_t idx_first = std::distance(begin, first); + inverse_indices_vec[indices[idx_first]] = idx_result; + } + + return ++result; + } + +template +std::tuple _unique_dim_cpu_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + // reshape tensor as [dim, -1] + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + std::vector indices(input_flat.size(0)); + std::iota(indices.begin(), indices.end(), 0); + int64_t numel = input_flat.size(1); + scalar_t* input_flat_ptr = ((scalar_t*)input_flat.data_ptr()); + + // sort indices using data + std::sort(indices.begin(), indices.end(), + [&](int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = at::empty(input_flat.sizes(), input_flat.type()); + for (int i = 0; i < indices.size(); ++i) { + input_sorted[i] = input_flat[indices[i]]; + } + + Tensor inverse_indices = at::empty(indices.size(), self.type().toScalarType(kLong)); + std::vector input_unbind = at::unbind(input_sorted, 0); + auto last = _unique_dim_cpu_impl( + input_unbind.begin(), input_unbind.end(), indices, inverse_indices); + input_unbind.erase(last, input_unbind.end()); + + // reshape back + auto output = at::stack(input_unbind, 0); + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + return std::make_tuple(output, inverse_indices); +} } // namespace std::tuple @@ -56,5 +132,13 @@ _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { }); } +std::tuple +_unique_dim_cpu(const 
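The CPU unique-by-dimension path treats each slice along dim as a row of a [size(dim), -1] matrix: row indices are sorted lexicographically by content, adjacent duplicate rows are collapsed (the std::unique-style _unique_dim_cpu_impl above), and inverse_indices maps every original row to its surviving representative. A self-contained sketch of the same idea on plain vectors of rows, illustrative only since the real code operates on at::Tensor slices:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// rows[i] is one slice along `dim`, already flattened. Returns the unique rows (sorted)
// and fills inverse[i] with the position of rows[i] in the unique set.
std::vector<std::vector<float>> unique_rows(const std::vector<std::vector<float>>& rows,
                                            std::vector<int64_t>& inverse) {
  std::vector<int64_t> order(rows.size());
  std::iota(order.begin(), order.end(), 0);
  // Lexicographic sort of row indices by row content, mirroring the std::sort lambda above.
  std::sort(order.begin(), order.end(), [&](int64_t a, int64_t b) {
    return rows[a] < rows[b];  // std::vector compares lexicographically
  });

  std::vector<std::vector<float>> out;
  inverse.assign(rows.size(), 0);
  for (int64_t idx : order) {
    if (out.empty() || out.back() != rows[idx]) {
      out.push_back(rows[idx]);                           // new unique row
    }
    inverse[idx] = static_cast<int64_t>(out.size()) - 1;  // original row -> unique slot
  }
  return out;
}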
Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + // The current implementation using `dim` always sorts due to unhashable tensors + return _unique_dim_cpu_template(self, dim, return_inverse); + }); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 0692dd0fea2901..bc37e83990e192 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -48,7 +48,7 @@ void magmaGesvBatched( } static magma_queue_t createMagmaQueue(const Tensor& tensor) { - auto& context = tensor.type().get_context(); + auto& context = at::globalContext(); magma_queue_t magma_queue; magma_queue_create_from_cuda( tensor.get_device(), diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index f2e13b4c708b62..c29337f90f1347 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -69,6 +69,92 @@ template return std::tuple(output, inverse_indices); } + +template + std::tuple _unique_dim_cuda_template( + const Tensor& self, + const int64_t dim, + const bool return_inverse) { + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + Tensor input_flat = self.transpose(dim, 0); + auto orig_sizes = input_flat.sizes().vec(); + input_flat = input_flat.contiguous().view({input_flat.size(0), -1}); + + scalar_t* input_flat_ptr = input_flat.data(); + + Tensor indices = at::arange(0, input_flat.size(0), self.type().toScalarType(kLong)); + int64_t* indices_ptr = indices.data(); + int64_t numel = input_flat.size(1); + + // sort indices using data + thrust::sort(policy, indices_ptr, indices_ptr + indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_flat_ptr[i + a * numel]; + scalar_t rhs = input_flat_ptr[i + b * numel]; + if (lhs < rhs) { + return true; + } else if (lhs > rhs) { + return false; + } + } + return false; + }); + + Tensor input_sorted = input_flat.index_select(0, indices); + + // get unique tensors + scalar_t* input_sorted_ptr = input_sorted.data(); + Tensor input_sorted_indices = at::arange(0, input_sorted.size(0), self.type().toScalarType(kLong)); + int64_t* input_sorted_indices_ptr = input_sorted_indices.data(); + auto last = thrust::unique(policy, input_sorted_indices_ptr, input_sorted_indices_ptr + input_sorted_indices.numel(), + [=] __device__ (int64_t a, int64_t b) -> bool { + for (int64_t i = 0; i < numel; ++i) { + scalar_t lhs = input_sorted_ptr[i + a * numel]; + scalar_t rhs = input_sorted_ptr[i + b * numel]; + if (lhs != rhs) { + return false; + } + } + return true; + }); + input_sorted_indices.resize_(last - input_sorted_indices_ptr); + Tensor output = input_sorted.index_select(0, input_sorted_indices); + + // reshape back + auto new_sizes = std::vector(orig_sizes); + new_sizes[0] = -1; + output = output.view(new_sizes); + output = output.transpose(0, dim); + + // calculate inverse indices + Tensor inverse_indices = at::empty({0}, self.type().toScalarType(kLong)); + if (return_inverse) { + int64_t size = self.size(dim); + inverse_indices.resize_(size); + Tensor mask = at::empty(input_sorted.size(0), self.type().toScalarType(kLong)); + mask[0] = 1; + for (int i = 0; i < input_sorted.size(0) - 1; ++i) { + if (!at::equal(input_sorted[i], 
input_sorted[i+1])) { + mask[i+1] = 1; + } else { + mask[i+1] = 0; + } + } + + Tensor imask = at::cumsum(mask, 0) - 1; + for (int i = 0; i < indices.size(0); ++i) { + inverse_indices[indices[i]] = imask[i]; + } + } + + THCudaCheck(cudaGetLastError()); + return std::tuple(output, inverse_indices); + } } // namespace #endif @@ -86,5 +172,16 @@ _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { #endif } +std::tuple +_unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse) { + #ifndef __HIP_PLATFORM_HCC__ + return AT_DISPATCH_ALL_TYPES(self.type(), "unique_dim", [&] { + return _unique_dim_cuda_template(self, dim, return_inverse); + }); + #else + AT_ERROR("unique_dim_cuda: HIP not supported"); + #endif +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 466fe6c3134e84..cb194cd0c7bdee 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1748,6 +1748,11 @@ CPU: _unique_cpu CUDA: _unique_cuda +- func: _unique_dim(Tensor self, int64_t dim, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + dispatch: + CPU: _unique_dim_cpu + CUDA: _unique_dim_cuda + - func: _unsafe_view(Tensor self, IntList size) -> Tensor variants: function diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp new file mode 100644 index 00000000000000..0c1eeb4818fbbc --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -0,0 +1,20 @@ +#include + +// ${generated_comment} + +#include +#include +#include +#include + +${cpu_type_headers} + +namespace at { + +void register_cpu_types(Context * context) { + ${cpu_type_registrations} + context->type_registry[static_cast(Backend::Undefined)] + [static_cast(ScalarType::Undefined)].reset(new UndefinedType()); +} + +} // namespace at diff --git a/aten/src/ATen/templates/RegisterCPU.h b/aten/src/ATen/templates/RegisterCPU.h new file mode 100644 index 00000000000000..b923c180aac805 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCPU.h @@ -0,0 +1,10 @@ +#pragma once + +// ${generated_comment} + +namespace at { + +class Context; +void register_cpu_types(Context * context); + +} // namespace at diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 4a17004bb5ff8c..2ef9dbf398fa2f 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -27,8 +27,8 @@ namespace at { -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -58,7 +58,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { AT_ERROR("unsafeTensorFromTH not supported on sparse"); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 28e8e5381f2933..4d8bf60522f7db 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,16 +2,17 @@ // ${generated_comment} -#include "ATen/Device.h" 
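With the native_functions.yaml entry in place, both the CPU and CUDA kernels are reachable through the generated at::_unique_dim overload. A hedged usage sketch; the signature follows the declaration above, while the tensor factory call and header are the standard ATen ones and assumed available:

#include <ATen/ATen.h>  // assumed umbrella header exposing the generated at::_unique_dim
#include <tuple>

void demo() {
  // Four identical rows; deduplicating along dim 0 should leave a single row.
  at::Tensor t = at::ones({4, 3});
  at::Tensor output, inverse;
  std::tie(output, inverse) =
      at::_unique_dim(t, /*dim=*/0, /*sorted=*/true, /*return_inverse=*/true);
  // output  -> the unique slices along dim 0
  // inverse -> for each original slice, its index into `output`
}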
+#include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" -#include "ATen/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/TensorAccessor.h" -#include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" #include "ATen/core/optional.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/core/Error.h" namespace at { struct Generator; @@ -38,16 +39,48 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. -struct AT_API Tensor : public detail::TensorBase { - using TensorBase = detail::TensorBase; - Tensor() : TensorBase() {} - Tensor(TensorImpl * self, bool retain) : TensorBase(self, retain) {} - Tensor(const c10::intrusive_ptr& ptr) : TensorBase(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) : TensorBase(std::move(ptr)) {} +struct AT_API Tensor { + Tensor(){}; + Tensor(TensorImpl* tensor_impl, bool retain) + : tensor_impl_(c10::intrusive_ptr::reclaim( + tensor_impl)) { + if (tensor_impl == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + if (retain && tensor_impl != UndefinedTensor::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl); + } + } + Tensor(const c10::intrusive_ptr& ptr) + : tensor_impl_(std::move(ptr)) {} + Tensor(c10::intrusive_ptr&& ptr) + : tensor_impl_(ptr) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + // The following overloads are very intruiging. 
Consider the following // program: // @@ -242,6 +275,9 @@ struct AT_API Tensor : public detail::TensorBase { } friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 214a5d18316588..e52c597b99eeb7 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -4,7 +4,7 @@ #include "ATen/Tensor.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Type.h" namespace at { diff --git a/aten/src/ATen/templates/Type.cpp b/aten/src/ATen/templates/Type.cpp index 40621a9be6e08b..ff154971e7bffb 100644 --- a/aten/src/ATen/templates/Type.cpp +++ b/aten/src/ATen/templates/Type.cpp @@ -5,26 +5,14 @@ #include "ATen/ExpandUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/Scalar.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorOptions.h" -#include "ATen/UndefinedType.h" #include "ATen/DeviceGuard.h" -#include - -#include -${cpu_type_headers} - namespace at { -void Type::registerCPU(Context * context) { - ${cpu_type_registrations} - context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType(context)); -} - Tensor & Type::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; std::tie(b_src) = expand_inplace(self, src, "copy"); @@ -50,10 +38,10 @@ Tensor Type::copy(const Tensor & src, bool non_blocking) const { } Type & Type::toBackend(Backend b) const { - return context->getType(b,scalarType()); + return at::globalContext().getType(b,scalarType()); } Type & Type::toScalarType(ScalarType s) const { - return context->getType(backend(),s); + return at::globalContext().getType(backend(),s); } static std::vector defaultStrides(IntList sizes) { std::vector strides(sizes.size()); diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index d4972d87a6dfd9..10c52ac14b6975 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -3,13 +3,13 @@ // ${generated_comment} #include "ATen/core/ATenGeneral.h" -#include "ATen/Allocator.h" +#include "ATen/core/Allocator.h" #include "ATen/core/Deprecated.h" #include "ATen/core/Generator.h" #include "ATen/core/Layout.h" #include "ATen/Scalar.h" #include "ATen/core/ScalarType.h" -#include "ATen/SparseTensorRef.h" +#include "ATen/core/SparseTensorRef.h" #include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" @@ -45,8 +45,8 @@ enum class TypeID { }; struct AT_API Type { - explicit Type(Context* context, TensorTypeId type_id, bool is_variable, bool is_undefined) - : context(context), type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} + explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} virtual ~Type() {} virtual ScalarType scalarType() const = 0; virtual Backend backend() const = 0; @@ -56,7 +56,6 @@ struct AT_API Type { virtual bool is_distributed() const = 0; bool is_variable() const noexcept { return is_variable_; } bool is_undefined() const noexcept { return is_undefined_; } - static void registerCPU(Context * context); virtual Storage storage(bool resizable = false) const = 0; virtual Storage storage(size_t size, bool resizable = 
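Dropping the Context* member from Type means every backend/dtype hop now goes through the at::globalContext() singleton, as the new toBackend()/toScalarType() bodies show; the registry they consult is a two-dimensional table indexed by Backend and ScalarType that register_cpu_types()/register_cuda_types() fill in. A hypothetical, trimmed-down mirror of that lookup shape (the real table lives on at::Context and holds the generated Type subclasses):

#include <memory>

struct Type { virtual ~Type() = default; };  // stand-in for at::Type

enum class Backend { CPU, CUDA, Undefined, NumOptions };
enum class ScalarType { Float, Long, Undefined, NumOptions };

struct MiniContext {
  std::unique_ptr<Type> type_registry[static_cast<int>(Backend::NumOptions)]
                                     [static_cast<int>(ScalarType::NumOptions)];

  Type& getType(Backend b, ScalarType s) {
    // Type::toBackend()/toScalarType() now route here via at::globalContext()
    // instead of a per-Type Context pointer; the real Context also lazily
    // initializes CUDA and errors on unregistered combinations.
    return *type_registry[static_cast<int>(b)][static_cast<int>(s)];
  }
};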
false) const = 0; virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; @@ -80,8 +79,6 @@ struct AT_API Type { Type & cuda() const { return this->toBackend(at::backendToCUDA(this->backend())); } - Context& get_context() const { return *context; } - // contiguous IDs for all types in the system // for external dispatch virtual TypeID ID() const = 0; @@ -112,7 +109,6 @@ struct AT_API Type { // virtual Tensor * add(Tensor & a, Tensor & b) = 0; ${type_method_declarations} protected: - Context* context; TensorTypeId type_id_; bool is_variable_; bool is_undefined_; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index fbafed82b57e02..4335a8f2209a20 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -38,8 +38,8 @@ static int getPointerDevice(void* ptr) { } #endif -${Type}::${Type}(Context* context) - : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} +${Type}::${Type}() + : Type(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -99,7 +99,7 @@ Storage ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { return Storage((${THStorage}*) th_pointer); } std::unique_ptr ${Type}::generator() const { - return std::unique_ptr(new ${Generator}(context)); + return std::unique_ptr(new ${Generator}(&at::globalContext())); } const char * ${Type}::toString() const { diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index e8613b62a333be..ec08e1a336daf6 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -16,7 +16,7 @@ namespace at { struct ${Type} final : public Type { - explicit ${Type}(Context* context); + explicit ${Type}(); virtual ScalarType scalarType() const override; virtual Backend backend() const override; virtual bool is_cuda() const override; diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index ab9f5343eddad9..9fe22beb0dc54e 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -102,7 +102,6 @@ INSTALL(FILES THTensor.hpp THStorageFunctions.hpp THGenerator.hpp - THTypeConversion.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") INSTALL(FILES diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp index c8924b54f4bf70..4a2cb18b92e07e 100644 --- a/aten/src/TH/THFile.cpp +++ b/aten/src/TH/THFile.cpp @@ -140,12 +140,12 @@ IMPLEMENT_THFILE_SCALAR(Half, THHalf) #define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_read##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } \ \ size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ { \ - return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->size()); \ + return THFile_write##TYPEC##Raw(self, TH##TYPEC##Storage_data(storage), storage->numel()); \ } IMPLEMENT_THFILE_STORAGE(Byte, uint8_t) diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index 5ff85eb2c8f40b..fb68639ec44752 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -2,40 +2,22 @@ #define TH_HALF_H #include -#include -/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ 
-#if defined(__GNUC__) -#define __thalign__(n) __attribute__((aligned(n))) -#elif defined(_WIN32) -#define __thalign__(n) __declspec(align(n)) -#else -#define __thalign__(n) +#ifdef __cplusplus +#include #endif -typedef struct __thalign__(2){ - unsigned short x; -} __THHalf; - -typedef struct __thalign__(4) { - unsigned int x; -} __THHalf2; - -typedef __THHalf THHalf; -typedef __THHalf2 THHalf2; +#ifdef __cplusplus +#define THHalf at::Half +#else +typedef struct at_Half at_Half; +#define THHalf at_Half +#endif TH_API void TH_float2halfbits(float*, unsigned short*); TH_API void TH_halfbits2float(unsigned short*, float*); TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - -#ifndef TH_HALF_BITS_TO_LITERAL -# define TH_HALF_BITS_TO_LITERAL(n) { n } -#endif - -#define TH_HALF_ZERO 0x0U -#define TH_HALF_INF 0x7C00U +TH_API float TH_half2float(THHalf); -#undef __thalign__ #endif diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 011c1d1f54aaee..3f2187b68f74ea 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -56,7 +56,7 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) return; else { - if(size < self->storage->size()) /* note the "<" and not "<=" */ + if(size < self->storage->numel()) /* note the "<" and not "<=" */ { self->size = size; THCharStorage_data(self->storage)[self->size] = '\0'; @@ -64,10 +64,10 @@ static void THMemoryFile_grow(THMemoryFile *self, ssize_t size) } } - missingSpace = size-self->storage->size()+1; /* +1 for the '\0' */ - THCharStorage_resize(self->storage, (self->storage->size()/2 > missingSpace ? - self->storage->size() + (self->storage->size()/2) - : self->storage->size() + missingSpace)); + missingSpace = size-self->storage->numel()+1; /* +1 for the '\0' */ + THCharStorage_resize(self->storage, (self->storage->numel()/2 > missingSpace ? + self->storage->numel() + (self->storage->numel()/2) + : self->storage->numel() + missingSpace)); } static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) @@ -188,12 +188,12 @@ static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) while (1) \ { \ ASCII_WRITE_ELEM; \ - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) \ + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) \ { \ mfself->position += nByteWritten; \ break; \ } \ - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); \ + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); \ } \ if(mfself->file.isAutoSpacing) \ { \ @@ -297,7 +297,7 @@ static void THMemoryFile_free(THFile *self) /* READ_WRITE_METHODS(bool, Bool, */ /* int value = 0; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ -/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", value), */ +/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", value), */ /* 1) */ READ_WRITE_METHODS(uint8_t, Byte, @@ -307,7 +307,7 @@ READ_WRITE_METHODS(uint8_t, Byte, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? 
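After this change THHalf is simply at::Half in C++ translation units (and an opaque at_Half struct in C), with only the TH_float2half/TH_half2float conversion entry points kept. A small round-trip sketch against those declared helpers, assuming TH is linked and the header is reachable as TH/THHalf.h:

#include <cstdio>
#include <TH/THHalf.h>  // THHalf is at::Half when this is compiled as C++

int main() {
  float in = 0.333f;
  THHalf h = TH_float2half(in);   // declared in THHalf.h
  float out = TH_half2float(h);   // back to float, with half-precision rounding
  std::printf("%f -> %f\n", in, out);
  return 0;
}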
n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -322,7 +322,7 @@ READ_WRITE_METHODS(int8_t, Char, nread = ret; \ i = n-1; \ memmove(data, THCharStorage_data(mfself->storage)+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size()-mfself->position ? n : -1); \ + nByteWritten = (n < mfself->storage->numel()-mfself->position ? n : -1); \ i = n-1; \ if(nByteWritten > -1) memmove(THCharStorage_data(mfself->storage)+mfself->position, data, nByteWritten), @@ -330,29 +330,29 @@ READ_WRITE_METHODS(int8_t, Char, READ_WRITE_METHODS(int16_t, Short, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%hd", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%hd", data[i]), 1) READ_WRITE_METHODS(int32_t, Int, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%d", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%d", data[i]), 1) READ_WRITE_METHODS(float, Float, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) READ_WRITE_METHODS(THHalf, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.9g", TH_half2float(data[i])), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), 1) READ_WRITE_METHODS(double, Double, int nByteRead_; int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%.17g", data[i]), + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.17g", data[i]), 1) static ssize_t THMemoryFile_readLong(THFile *self, int64_t *data, ssize_t n) @@ -491,13 +491,13 @@ static ssize_t THMemoryFile_writeLong(THFile *self, int64_t *data, ssize_t n) ssize_t nByteWritten; while 
(1) { - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->size()-mfself->position, "%" PRId64, data[i]); - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size()-mfself->position) ) + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%" PRId64, data[i]); + if( (nByteWritten > -1) && (nByteWritten < mfself->storage->numel()-mfself->position) ) { mfself->position += nByteWritten; break; } - THMemoryFile_grow(mfself, mfself->storage->size() + (mfself->storage->size()/2) + 2); + THMemoryFile_grow(mfself, mfself->storage->numel() + (mfself->storage->numel()/2) + 2); } if(mfself->file.isAutoSpacing) { @@ -654,7 +654,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) if(storage) { - THArgCheck(THCharStorage_data(storage)[storage->size()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); + THArgCheck(THCharStorage_data(storage)[storage->numel()-1] == '\0', 1, "provided CharStorage must be terminated by 0"); THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); THCharStorage_retain(storage); } @@ -668,7 +668,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) mfself = static_cast(THAlloc(sizeof(THMemoryFile))); mfself->storage = storage; - mfself->size = (storage ? storage->size()-1 : 0); + mfself->size = (storage ? storage->numel()-1 : 0); mfself->position = 0; mfself->longSize = 0; diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index b0e4abe9329db7..a5319e67dabe61 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -34,7 +34,7 @@ void THStorage_free(THStorage* storage) { ptrdiff_t THStorage_size(const THStorage *self) { - return self->size(); + return self->numel(); } void THStorage_retain(THStorage *storage) @@ -49,21 +49,21 @@ void THStorage_resize(THStorage* storage, ptrdiff_t size) { /* case when the allocator does not have a realloc defined */ at::DataPtr new_data; if (size != 0) { - new_data = storage->allocator()->allocate(storage->elementSize() * size); + new_data = storage->allocator()->allocate(storage->itemsize() * size); } at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); - ptrdiff_t old_size = storage->size(); - storage->set_size(size); + ptrdiff_t old_size = storage->numel(); + storage->set_numel(size); if (old_data != nullptr) { ptrdiff_t copy_size = old_size; - if (storage->size() < copy_size) { - copy_size = storage->size(); + if (storage->numel() < copy_size) { + copy_size = storage->numel(); } if (copy_size > 0) { memcpy( storage->data(), old_data.get(), - storage->elementSize() * copy_size); + storage->itemsize() * copy_size); } } } else { diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 9fe0db5e5497f9..362fa6e2c83de5 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -8,7 +8,6 @@ #include #include -#include "THTypeConversion.hpp" #include // Note [Weak references for intrusive refcounting] diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 1b1f493ac4e289..0c731779b95685 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -125,7 +125,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(!THTensor_getStoragePtr(self)) { THTensor_stealAndSetStoragePtr(self, 
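THStorage_resize keeps the same shape after the rename: allocate itemsize() times the new numel through the storage's allocator, swap the data pointer, update numel, and copy back min(old numel, new numel) elements if there was old data. A freestanding sketch of that resize policy; illustrative only, since the real code works with at::DataPtr and the storage's allocator rather than malloc/free:

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Hypothetical mirror of the resize-with-copy policy in THStorage_resize above.
struct RawStorage {
  void* data = nullptr;
  int64_t numel = 0;
  size_t itemsize = sizeof(float);
};

void resize_storage(RawStorage& s, int64_t new_numel) {
  void* new_data = new_numel != 0 ? std::malloc(s.itemsize * new_numel) : nullptr;
  if (s.data && new_data) {
    const int64_t copy_numel = std::min(s.numel, new_numel);  // keep the overlapping prefix
    std::memcpy(new_data, s.data, s.itemsize * copy_numel);
  }
  std::free(s.data);  // the real code releases the old at::DataPtr instead
  s.data = new_data;
  s.numel = new_numel;
}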
THStorage_new(self->scalar_type())); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/TH/THTypeConversion.hpp b/aten/src/TH/THTypeConversion.hpp deleted file mode 100644 index d40169e7180e58..00000000000000 --- a/aten/src/TH/THTypeConversion.hpp +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include -#include "THHalf.h" - -// Type traits to convert types to TH-specific types. Used primarily to -// convert at::Half to TH's half type. This makes the conversion explicit. -// FIXME: we should just use the same type - -namespace th { - -template -struct FromTypeConversion { - using type = T; -}; - -template <> -struct FromTypeConversion { - using type = at::Half; -}; - -template -using from_type = typename FromTypeConversion::type; -} diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 21431ef778d5a0..992cbd5bb7509f 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType>::to()); + return THStorage_new(at::CTypeToScalarType::to()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, allocator, true).release(); @@ -48,7 +48,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType>::to(); + auto scalar_type = at::CTypeToScalarType::to(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( at::scalarTypeToDataType(scalar_type), @@ -59,7 +59,7 @@ THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int false).release(); if (size <= 0) { - storage->set_size(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / at::elementSize(scalar_type)); } return storage; @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType>::to()), + at::scalarTypeToDataType(at::CTypeToScalarType::to()), size, std::move(data), allocator, @@ -132,19 +132,19 @@ void THStorage_(resize)(THStorage *storage, ptrdiff_t size) void THStorage_(fill)(THStorage *storage, real value) { ptrdiff_t i; - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) THStorage_(data)(storage)[i] = value; } void THStorage_(set)(THStorage *self, ptrdiff_t idx, real value) { - THArgCheck((idx >= 0) && (idx < self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); THStorage_(data)(self)[idx] = value; } real THStorage_(get)(const THStorage *self, ptrdiff_t idx) { - THArgCheck((idx >= 0) && (idx < 
self->size()), 2, "out of bounds"); + THArgCheck((idx >= 0) && (idx < self->numel()), 2, "out of bounds"); return THStorage_(data)(self)[idx]; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 0cde162d4c2843..442f7dbde2925d 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -6,13 +6,13 @@ void THStorage_(rawCopy)(THStorage *storage, real *src) { ptrdiff_t i; real *data = THStorage_(data)(storage); - for(i = 0; i < storage->size(); i++) + for(i = 0; i < storage->numel(); i++) data[i] = src[i]; } void THStorage_(copy)(THStorage *storage, THStorage *src) { - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); THStorage_(rawCopy)(storage, THStorage_(data)(src)); } @@ -25,40 +25,40 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = (real)TH_half2float(src_data[i]); \ } #define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = TH_float2half((float)(src_data[i])); \ } #define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - THArgCheck(storage->size() == src->size(), 2, "size mismatch"); \ + THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->size(); i++) \ + for(i = 0; i < storage->numel(); i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index a9a1790c58c830..96e3938e20b0f9 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -20,17 +20,17 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) if (!self->resizable()) THError("Trying to resize storage that is not resizable"); - size_t elementSize = self->elementSize(); + size_t itemsize = self->itemsize(); if(size == 0) { self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); - self->set_size(0); + self->set_numel(0); } else { at::DataPtr data = - self->allocator()->allocate(size * elementSize); + self->allocator()->allocate(size * itemsize); if (self->data_ptr()) { // Enable p2p access when the memcpy is across devices @@ 
-38,14 +38,14 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) THCudaCheck(cudaMemcpyAsync(data.get(), self->data(), - THMin(self->size(), size) * elementSize, + THMin(self->numel(), size) * itemsize, cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // Destructively overwrite data_ptr self->set_data_ptr(std::move(data)); - self->set_size(size); + self->set_numel(size); } } diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 3826ea57fc5da3..de787bd380b6e6 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -148,7 +148,7 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, const if(!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->size()) { + if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THCStorage_resize(state, THTensor_getStoragePtr(self), totalSize+self->storage_offset()); } } diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index aef30d62517061..feb2e94959abf2 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -21,7 +21,7 @@ int THCStorage_(elementSize)(THCState *state) void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(real), cudaMemcpyHostToDevice, @@ -31,7 +31,7 @@ void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real v real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { - THArgCheck((index >= 0) && (index < self->size()), 2, "index out of bounds"); + THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds"); real value; cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index, sizeof(real), diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index a6b3bf557e2f63..95f2bc7163d46f 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -10,7 +10,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) #if CUDA_VERSION >= 7000 thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), #endif - self_data, self_data+self->size(), value); + self_data, self_data+self->numel(), value); } void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) diff --git a/aten/src/THC/generic/THCStorageCopy.cpp b/aten/src/THC/generic/THCStorageCopy.cpp index 9194ab7d3c80d4..546777baaf98c7 100644 --- a/aten/src/THC/generic/THCStorageCopy.cpp +++ b/aten/src/THC/generic/THCStorageCopy.cpp @@ -4,11 +4,11 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), THStorage_(data)(src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyHostToDevice, stream)); THCudaCheck(cudaStreamSynchronize(stream)); 
@@ -18,9 +18,9 @@ void THCStorage_(copyCPU)(THCState *state, THCStorage *self, struct THStorage *s void THCStorage_(copy##TYPEC)(THCState *state, THCStorage *self, struct TH##TYPEC##Storage *src) \ { \ THCTensor* selfTensor = \ - THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct TH##TYPEC##Tensor* srcTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(src, 0, src->numel(), 1); \ THCTensor_(copy##TYPEC)(state, selfTensor, srcTensor); \ TH##TYPEC##Tensor_free(srcTensor); \ THCTensor_(free)(state, selfTensor); \ @@ -36,11 +36,11 @@ TH_CUDA_STORAGE_IMPLEMENT_COPY(Double) void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *src) { - THArgCheck(self->size() == src->size(), 2, "size does not match"); + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); cudaStream_t stream = THCState_getCurrentStream(state); THCudaCheck(cudaMemcpyAsync(THStorage_(data)(self), THCStorage_(data)(state, src), - self->size() * sizeof(real), + self->numel() * sizeof(real), cudaMemcpyDeviceToHost, stream)); THCudaCheck(cudaStreamSynchronize(stream)); @@ -50,9 +50,9 @@ void THStorage_(copyCuda)(THCState *state, THStorage *self, struct THCStorage *s void TH_CONCAT_4(TH,TYPEC,Storage_copyCuda,Real)(THCState *state, TH##TYPEC##Storage *self, struct THCStorage *src) \ { \ TH##TYPEC##Tensor* selfTensor = \ - TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->size(), 1); \ + TH##TYPEC##Tensor_newWithStorage1d(self, 0, self->numel(), 1); \ struct THCTensor* srcTensor = \ - THCTensor_(newWithStorage1d)(state, src, 0, src->size(), 1); \ + THCTensor_(newWithStorage1d)(state, src, 0, src->numel(), 1); \ TH_CONCAT_4(TH,TYPEC,Tensor_copyCuda,Real)(state, selfTensor, srcTensor); \ THCTensor_(free)(state, srcTensor); \ TH##TYPEC##Tensor_free(selfTensor); \ diff --git a/aten/src/THC/generic/THCStorageCopy.cu b/aten/src/THC/generic/THCStorageCopy.cu index bea4fe699623fb..962167c73b82c8 100644 --- a/aten/src/THC/generic/THCStorageCopy.cu +++ b/aten/src/THC/generic/THCStorageCopy.cu @@ -4,17 +4,17 @@ void THCStorage_(rawCopy)(THCState *state, THCStorage *self, real *src) { - THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->size() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self), src, self->numel() * sizeof(real), cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); } // conversions are delegated to THCTensor implementation #define THC_CUDA_STORAGE_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ void THCStorage_(copyCuda##TYPEC)(THCState *state, THCStorage *self, struct THCuda##TYPECUDA##Storage *src) \ { \ - THArgCheck(self->size() == src->size(), 2, "size does not match"); \ - THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->size(), 1); \ + THArgCheck(self->numel() == src->numel(), 2, "size does not match"); \ + THCTensor* selfTensor = THCTensor_(newWithStorage1d)(state, self, 0, self->numel(), 1); \ struct THCuda##TYPECUDA##Tensor* srcTensor = \ - THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->size(), 1); \ + THCuda##TYPECUDA##Tensor_newWithStorage1d(state, src, 0, src->numel(), 1); \ THCTensor_(copyCuda##TYPEC)(state, selfTensor, srcTensor); \ THCuda##TYPECUDA##Tensor_free(state, srcTensor); \ THCTensor_(free)(state, selfTensor); \ diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu 
b/aten/src/THC/generic/THCTensorMathMagma.cu index aee04a8e22a4e4..3b63c3ae1c7b2f 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -235,7 +235,13 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T else if (info < 0) THError("MAGMA syev : Argument %d : illegal value", -info); } - THCTensor_(freeCopyTo)(state, input, rv_); + if (jobzs[0] == 'N') { + // If eigenvector is not needed, fill the result with zeros. + THCTensor_(zero)(state, rv_); + THCTensor_(free)(state, input); + } else { + THCTensor_(freeCopyTo)(state, input, rv_); + } #else THError(NoMagma(syev)); #endif diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 7a441dc1c5c2ee..daf3ccac90eecf 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -14,8 +14,9 @@ * limitations under the License. */ -#include #include +#include +#include #include #include "binaries/benchmark_helper.h" @@ -309,3 +310,88 @@ void writeOutput( } } } + +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache) { + caffe2::GlobalInit(&argc, &argv); + // Check arguments to be correct + { + // Need to check whether file exists, as the file reader does not assert if + // file does not exist + std::ifstream net_file(FLAGS_net); + CAFFE_ENFORCE(net_file.good()); + + std::ifstream init_net_file(FLAGS_init_net); + CAFFE_ENFORCE(init_net_file.good()); + + if (FLAGS_input_file.size() > 0) { + vector input_files = caffe2::split(',', FLAGS_input_file); + for (auto input_file : input_files) { + std::ifstream ifile(input_file); + CAFFE_ENFORCE(ifile.good()); + } + } + } + + observerConfig(); + caffe2::ShowLogInfoToStderr(); + + auto workspace = std::make_shared(new caffe2::Workspace()); + bool run_on_gpu = backendCudaSet(FLAGS_backend); + // Run initialization network. + caffe2::NetDef init_net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); + setOperatorEngine(&init_net_def, FLAGS_backend); + CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); + + // Run main network. 
+ caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); + setOperatorEngine(&net_def, FLAGS_backend); + + map tensor_protos_map; + + loadInput( + workspace, + run_on_gpu, + tensor_protos_map, + FLAGS_input, + FLAGS_input_file, + FLAGS_input_dims, + FLAGS_input_type); + + runNetwork( + workspace, + net_def, + tensor_protos_map, + FLAGS_wipe_cache, + FLAGS_run_individual, + FLAGS_warmup, + FLAGS_iter, + FLAGS_sleep_before_run); + + writeOutput( + workspace, + run_on_gpu, + FLAGS_output, + FLAGS_output_folder, + FLAGS_text_output); + + return 0; +} diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index df23ed8651118a..5bf79182dab7e1 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -98,3 +98,21 @@ void runNetwork( const int, const int, const int); +int benchmark( + int argc, + char* argv[], + const string& FLAGS_backend, + const string& FLAGS_init_net, + const string& FLAGS_input, + const string& FLAGS_input_dims, + const string& FLAGS_input_file, + const string& FLAGS_input_type, + int FLAGS_iter, + const string& FLAGS_net, + const string& FLAGS_output, + const string& FLAGS_output_folder, + bool FLAGS_run_individual, + int FLAGS_sleep_before_run, + bool FLAGS_text_output, + int FLAGS_warmup, + bool FLAGS_wipe_cache); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index c5a93ae7cbae33..38badccfa1e4bb 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -77,51 +77,22 @@ CAFFE2_DEFINE_bool( "Whether to evict the cache before running network."); int main(int argc, char** argv) { - caffe2::GlobalInit(&argc, &argv); - - observerConfig(); - caffe2::ShowLogInfoToStderr(); - - auto workspace = make_shared(new caffe2::Workspace()); - bool run_on_gpu = backendCudaSet(caffe2::FLAGS_backend); - // Run initialization network. - caffe2::NetDef init_net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_init_net, &init_net_def)); - setOperatorEngine(&init_net_def, caffe2::FLAGS_backend); - CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); - - // Run main network. - caffe2::NetDef net_def; - CAFFE_ENFORCE(ReadProtoFromFile(caffe2::FLAGS_net, &net_def)); - setOperatorEngine(&net_def, caffe2::FLAGS_backend); - - map tensor_protos_map; - - loadInput( - workspace, - run_on_gpu, - tensor_protos_map, + benchmark( + argc, + argv, + caffe2::FLAGS_backend, + caffe2::FLAGS_init_net, caffe2::FLAGS_input, - caffe2::FLAGS_input_file, caffe2::FLAGS_input_dims, - caffe2::FLAGS_input_type); - - runNetwork( - workspace, - net_def, - tensor_protos_map, - caffe2::FLAGS_wipe_cache, - caffe2::FLAGS_run_individual, - caffe2::FLAGS_warmup, + caffe2::FLAGS_input_file, + caffe2::FLAGS_input_type, caffe2::FLAGS_iter, - caffe2::FLAGS_sleep_before_run); - - writeOutput( - workspace, - run_on_gpu, + caffe2::FLAGS_net, caffe2::FLAGS_output, caffe2::FLAGS_output_folder, - caffe2::FLAGS_text_output); - - return 0; + caffe2::FLAGS_run_individual, + caffe2::FLAGS_sleep_before_run, + caffe2::FLAGS_text_output, + caffe2::FLAGS_warmup, + caffe2::FLAGS_wipe_cache); } diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index ec7518630c9525..a394f91c729b87 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -236,6 +236,12 @@ MessageLogger::~MessageLogger() { if (severity_ >= FLAGS_caffe2_log_level) { // If not building on Android, log all output to std::cerr. 
std::cerr << stream_.str(); + // Simulating the glog default behavior: if the severity is above INFO, + // we flush the stream so that the output appears immediately on std::cerr. + // This is expected in some of our tests. + if (severity_ > INFO) { + std::cerr << std::flush; + } } #endif // ANDROID if (severity_ == FATAL) { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h index 8560ff82374d9a..8c24a2e2cb1076 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/Compiler.h @@ -11,15 +11,15 @@ namespace repr { class CAFFE2_API Value { public: enum class ValueKind { Value, Instruction, Data }; - Value(ValueKind K) : Kind(K) {} - Value() : Kind(ValueKind::Value) {} + Value(ValueKind K) : kind_(K) {} + Value() : kind_(ValueKind::Value) {} ValueKind getKind() const { - return Kind; + return kind_; } virtual ~Value() = default; private: - const ValueKind Kind; + const ValueKind kind_; }; class CAFFE2_API Data : public Value { @@ -30,15 +30,15 @@ class CAFFE2_API Data : public Value { } virtual ~Data() = default; size_t getVersion() const { - return Version; + return version_; } void setVersion(size_t version) { - Version = version; + version_ = version; } private: - size_t Version = 0; + size_t version_ = 0; }; class CAFFE2_API Instruction : public Value { @@ -52,18 +52,18 @@ class CAFFE2_API Instruction : public Value { TerminatorEnd, Phi }; - Instruction() : Value(ValueKind::Instruction), Op(Opcode::Generic) {} - Instruction(Opcode op) : Value(ValueKind::Instruction), Op(op) {} + Instruction() : Value(ValueKind::Instruction), op_(Opcode::Generic) {} + Instruction(Opcode op) : Value(ValueKind::Instruction), op_(op) {} CAFFE2_API static bool classof(const Value* V) { return V->getKind() == ValueKind::Instruction; } virtual ~Instruction() = default; Opcode getOpcode() const { - return Op; + return op_; } private: - Opcode Op; + Opcode op_; }; class CAFFE2_API Terminator : public Instruction { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h index 835f187febf15d..1934b1f1b7bad4 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/ControlFlow.h @@ -19,45 +19,45 @@ class CAFFE2_API BasicBlock { using NodeRef = typename Subgraph::NodeRef; BasicBlock() {} ~BasicBlock() { - for (auto pair : callbacks) { + for (auto pair : callbacks_) { pair.first->deleteDestructorCallback(pair.second); } } void trackNode(NodeRef node) { - callbacks[node] = node->registerDestructorCallback([&](NodeRef n) { + callbacks_[node] = node->registerDestructorCallback([&](NodeRef n) { assert( hasInstruction(n) && "Destructor callback invoked on untracked node in BasicBlock."); deleteInstruction(n); }); - Nodes.addNode(node); + nodes_.addNode(node); } void untrackNode(NodeRef node) { - callbacks.erase(node); - Nodes.removeNode(node); + callbacks_.erase(node); + nodes_.removeNode(node); } void pushInstructionNode(NodeRef node) { assert( isa(node->data()) && "Cannot push non-instruction node to basic block."); - Instructions.emplace_back(node); + instructions_.emplace_back(node); trackNode(node); } const std::vector& getInstructions() { - return Instructions; + return instructions_; } bool hasInstruction(NodeRef instr) const { - return 
Nodes.hasNode(instr); + return nodes_.hasNode(instr); } void insertInstructionBefore(NodeRef newInstr, NodeRef instr) { auto it = - std::find(std::begin(Instructions), std::end(Instructions), instr); - Instructions.insert(it, newInstr); + std::find(std::begin(instructions_), std::end(instructions_), instr); + instructions_.insert(it, newInstr); trackNode(newInstr); } @@ -65,28 +65,28 @@ class CAFFE2_API BasicBlock { assert(hasInstruction(instr1) && "Instruction not in basic block."); assert(hasInstruction(instr2) && "Instruction not in basic block."); auto it1 = - std::find(std::begin(Instructions), std::end(Instructions), instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr1); auto it2 = - std::find(std::begin(Instructions), std::end(Instructions), instr2); - Instructions.erase(it1); - Instructions.insert(it2, instr1); + std::find(std::begin(instructions_), std::end(instructions_), instr2); + instructions_.erase(it1); + instructions_.insert(it2, instr1); } void deleteInstruction(NodeRef instr) { assert(hasInstruction(instr) && "Instruction not in basic block."); - Instructions.erase( - std::remove(Instructions.begin(), Instructions.end(), instr), - Instructions.end()); + instructions_.erase( + std::remove(instructions_.begin(), instructions_.end(), instr), + instructions_.end()); untrackNode(instr); } private: - Subgraph Nodes; - std::vector Instructions; + Subgraph nodes_; + std::vector instructions_; // Because we reference a dataflow graph, we need to register callbacks // for when the dataflow graph is modified. std::unordered_map>::Callback*> - callbacks; + callbacks_; }; using Program = Graph; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 1f7e2c27906c99..b1e9283bc9ccee 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -45,19 +45,19 @@ class CAFFE2_API Annotation { public: enum class AnnotationKind { Generic, Caffe2 }; - Annotation(AnnotationKind K) : Kind(K) {} - Annotation() : Kind(AnnotationKind::Generic) {} + Annotation(AnnotationKind kind) : kind_(kind) {} + Annotation() : kind_(AnnotationKind::Generic) {} virtual ~Annotation() {} AnnotationKind getKind() const { - return Kind; + return kind_; } Annotation(const Annotation&) = delete; Annotation& operator=(Annotation&) = delete; private: - const AnnotationKind Kind; + const AnnotationKind kind_; }; class CAFFE2_API NeuralNetOperator : public Instruction { @@ -75,36 +75,38 @@ class CAFFE2_API NeuralNetOperator : public Instruction { enum class NNLayout { Undefined, NCHW, NHWC }; NeuralNetOperator(NNKind K, Opcode I, NNLayout L) - : Instruction(I), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K, Opcode I) - : Instruction(I), Kind(K), Layout(NNLayout::Undefined) {} - NeuralNetOperator(NNKind K, NNLayout L) : Instruction(), Kind(K), Layout(L) {} + : Instruction(I), kind_(K), layout_(NNLayout::Undefined) {} + NeuralNetOperator(NNKind K, NNLayout L) + : Instruction(), kind_(K), layout_(L) {} NeuralNetOperator(NNKind K) - : Instruction(), Kind(K), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(K), layout_(NNLayout::Undefined) {} NeuralNetOperator() - : Instruction(), Kind(NNKind::Undefined), Layout(NNLayout::Undefined) {} + : Instruction(), kind_(NNKind::Undefined), layout_(NNLayout::Undefined) {} NNKind getKind() const { - return 
Kind; + return kind_; } void setLayout(NNLayout L) { - Layout = L; + layout_ = L; } NNLayout getLayout() const { - return Layout; + return layout_; } void setAnnotation(std::unique_ptr extraAnnotation) { - ExtraAnnotation = std::move(extraAnnotation); + extraAnnotation_ = std::move(extraAnnotation); } const Annotation* getAnnotation() const { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } + Annotation* getMutableAnnotation() { - return ExtraAnnotation.get(); + return extraAnnotation_.get(); } const std::string getName() const; @@ -128,9 +130,9 @@ class CAFFE2_API NeuralNetOperator : public Instruction { NeuralNetOperator& operator=(NeuralNetOperator&) = delete; private: - const NNKind Kind; - NNLayout Layout; // Mutable attribute, much like a type cast - std::unique_ptr ExtraAnnotation; + const NNKind kind_; + NNLayout layout_; // Mutable attribute, much like a type cast + std::unique_ptr extraAnnotation_; }; class CAFFE2_API NeuralNetData : public Data { @@ -138,12 +140,12 @@ class CAFFE2_API NeuralNetData : public Data { /// Discriminator for LLVM-style RTTI (isa<>) enum class NNDataKind { Generic, Tensor }; - NeuralNetData(NNDataKind kind) : Kind(kind) {} + NeuralNetData(NNDataKind kind) : kind_(kind) {} - NeuralNetData() : Kind(NNDataKind::Generic) {} + NeuralNetData() : kind_(NNDataKind::Generic) {} NNDataKind getKind() const { - return Kind; + return kind_; } virtual NeuralNetData* clone() = 0; @@ -153,8 +155,8 @@ class CAFFE2_API NeuralNetData : public Data { virtual ~NeuralNetData() = 0; private: - NNDataKind Kind; - size_t Version = 0; + NNDataKind kind_; + size_t version_ = 0; }; class CAFFE2_API Tensor : public NeuralNetData { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h index cef1bdec522a56..91e4c2f6e01e87 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Support/Common.h @@ -71,13 +71,13 @@ class Notifier { Notifier() {} Callback* registerDestructorCallback(Callback fn) { - DtorCallbacks.emplace_back(fn); - return &DtorCallbacks.back(); + dtorCallbacks_.emplace_back(fn); + return &dtorCallbacks_.back(); } Callback* registerNotificationCallback(Callback fn) { - NotifCallbacks.emplace_back(fn); - return &NotifCallbacks.back(); + notifCallbacks_.emplace_back(fn); + return ¬ifCallbacks_.back(); } void deleteCallback(std::list& callbackList, Callback* toDelete) { @@ -90,11 +90,11 @@ class Notifier { } void deleteDestructorCallback(Callback* c) { - deleteCallback(DtorCallbacks, c); + deleteCallback(dtorCallbacks_, c); } void deleteNotificationCallback(Callback* c) { - deleteCallback(NotifCallbacks, c); + deleteCallback(notifCallbacks_, c); } /// \brief Notifies all listeners (`registerNotificationCallback` @@ -102,20 +102,20 @@ class Notifier { /// is encoded in the state of the derived class, thus only passing /// a pointer of type T* to the callback. 
void notify() { - for (auto callback : NotifCallbacks) { + for (auto callback : notifCallbacks_) { callback(reinterpret_cast(this)); } } virtual ~Notifier() { - for (auto callback : DtorCallbacks) { + for (auto callback : dtorCallbacks_) { callback(reinterpret_cast(this)); } } private: - std::list DtorCallbacks; - std::list NotifCallbacks; + std::list dtorCallbacks_; + std::list notifCallbacks_; }; #endif /* NOM_SUPPORT_COMMON_H */ diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 23740cfc5772e5..f1934f5ddbc28d 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -25,7 +25,7 @@ inline vector ToVectorTIndex(const std::vector& src) { } /** - * Return product of all dimensions starting from K + * Return product of all dimensions starting from k */ inline TIndex size_from_dim_(int k, const vector& dims) { TIndex r = 1; @@ -35,7 +35,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { return r; } -// Product of all dims up to +// Product of all dims up to k (not including dims[k]) inline TIndex size_to_dim_(int k, const vector& dims) { CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; @@ -61,6 +61,7 @@ inline TIndex size_between_dim_(int k, int l, const vector& dims) { return r; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim inline int canonical_axis_index_(int axis_index, int ndims) { CAFFE_ENFORCE_GE(axis_index, -ndims); CAFFE_ENFORCE_LT(axis_index, ndims); @@ -274,9 +275,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); CAFFE_ENFORCE_GE_WITH_CALLER( num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE( - storage_.use_count() == 1, - "Can't call Extend on shared storage, please call Resize instead"); auto newDims = dims_; newDims[0] += num; if (!storage_->data()) { diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index eb2d5b6acf1a61..25d4e16d2f9e7a 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -25,13 +25,21 @@ class IDEEPConcatOp final : public IDEEPOperator { virtual ~IDEEPConcatOp() {} bool RunOnDevice() override { - const auto& input_zero = Input(INPUT0); auto* output = Output(OUTPUT); TensorCPU* axis_info = OperatorBase::Output(AXIS_INFO, CPU); vector inputs; for (int i = 0; i < InputSize(); ++i) { - inputs.emplace_back(Input(i)); + if (OperatorBase::InputBlob(i).template IsType()) { + inputs.emplace_back(Input(i)); + } else { + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + "Expect cpu tensor if not itensor"); + auto& tensor_cpu = OperatorBase::Input(i, CPU); + CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || + tensor_cpu.size_from_dim(0) == 0, + "Expect zero dim tensor"); + } } auto axis_vdata = ideep::concat::compute(inputs, axis_, add_axis_, *output); diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index 8251b386eeb3c7..75895c5d844345 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -116,6 +118,12 @@ REGISTER_IDEEP_OPERATOR( REGISTER_IDEEP_OPERATOR( BBoxTransform, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + AffineChannel, + IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR( + StopGradient, + IDEEPFallbackOp>); REGISTER_IDEEP_OPERATOR( PadImage, diff --git 
a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index ae4f903c23c2fc..31df729a217850 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -53,6 +53,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { // then forward output blobs to local workspace. std::unordered_map forwarded_output_blobs; for (int i = 0; i < base_def_.output_size(); i++) { + // For in-place case, the in/output tensor for local_ws must be + // re-created, instead of forwarding from current workspace. string parent_name(base_def_.output(i)); if (!SkipOutputCopy::Contains(i)) { parent_name += "_cpu_output_blob_" + base_def_.type(); @@ -60,6 +62,13 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_output_blobs_.push_back(ws->CreateBlob(parent_name)); CHECK_NOTNULL(local_output_blobs_.back()); forwarded_output_blobs[base_def_.output(i)] = parent_name; + output_inplace_.push_back(false); + for (const string &input_name : base_def_.input()) { + if (input_name == base_def_.output(i)) { + output_inplace_[i] = true; + break; + } + } } local_ws_.reset(new Workspace(ws, forwarded_output_blobs)); // Set up the symbols for the local workspace. @@ -67,31 +76,26 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_.push_back(local_ws_->CreateBlob(name)); CHECK_NOTNULL(local_input_blobs_.back()); } + input_share_.resize(local_input_blobs_.size(), false); base_op_.reset(new CPUOp(base_def_, local_ws_.get())); } bool RunOnDevice() override { for (int i = 0; i < InputSize(); ++i) { - if (InputIsType(i) && Input(i).get_data_type() == itensor::data_type::f32) { + if (InputIsType(i) && + Input(i).get_data_type() == itensor::data_type::f32) { auto& input = Input(i); - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); - dtensor->Resize(input.get_dims()); - if (input.is_public_format()) { - dtensor->ShareExternalPointer(static_cast(input.get_data_handle())); - } else { - input.reorder_to(dtensor->template mutable_data()); + if (input_share_[i]) { + local_input_blobs_[i]->Reset(); } - } else if ( - InputIsType(i) && - Input(i).get_data_type() == itensor::data_type::s32) { - auto& input = Input(i); + input_share_[i] = false; auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( - static_cast(input.get_data_handle())); + static_cast(input.get_data_handle())); } else { - input.reorder_to(dtensor->template mutable_data()); + input.reorder_to(dtensor->template mutable_data()); } } else { VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy."; @@ -99,8 +103,9 @@ class IDEEPFallbackOp final : public IDEEPOperator { // local_input_blobs will only be used as const blob input for the // base op so we are still fine. 
local_input_blobs_[i]->ShareExternal( - const_cast(OperatorBase::Inputs()[i]->GetRaw()), + const_cast(OperatorBase::Inputs()[i]->GetRaw()), OperatorBase::Inputs()[i]->meta()); + input_share_[i] = true; } } @@ -120,21 +125,16 @@ class IDEEPFallbackOp final : public IDEEPOperator { "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); - auto src_dims = src.dims(); - if (src.ndim() == 0) { - VLOG(1) << "Copy output: index " << i << " skipped."; + if (src.template IsType() && + src.dims().size() != 0 && src.size_from_dim(0) != 0 && + base_op_->type() != "Python") { Blob* dst = OperatorBase::OutputBlob(i); - dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); - dtensor->Resize(src_dims); - dtensor->ShareData(src); - continue; - } - - if (src.template IsType()) { - Blob* dst = OperatorBase::OutputBlob(i); - if (!dst->template IsType()) { + // The output tensor must be ideep tensor with public format. + // If reusing ideep tensor with non-public format, the tensor buffer + // will be interpreted incorrectly. + if (!dst->template IsType() || + !dst->template Get().is_public_format()) { dst->Reset(new itensor()); } @@ -143,7 +143,12 @@ class IDEEPFallbackOp final : public IDEEPOperator { if (dtensor->get_dims() != dst_dims) { dtensor->resize(dst_dims, itensor::data_type::f32); } - dtensor->set_data_handle(const_cast(src.raw_data())); + if (output_inplace_[i]) { + dtensor->reorder_from(dst_dims, itensor::data_type::f32, + const_cast(src.raw_data())); + } else { + dtensor->set_data_handle(const_cast(src.raw_data())); + } } else { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); @@ -159,6 +164,8 @@ class IDEEPFallbackOp final : public IDEEPOperator { protected: vector local_input_blobs_; vector local_output_blobs_; + vector output_inplace_; + vector input_share_; std::unique_ptr base_op_; std::unique_ptr local_ws_; OperatorDef base_def_; diff --git a/caffe2/mpi/mpi_common.h b/caffe2/mpi/mpi_common.h index 3e1e7a5625bd2e..b283a0aea382c3 100644 --- a/caffe2/mpi/mpi_common.h +++ b/caffe2/mpi/mpi_common.h @@ -4,6 +4,7 @@ #include #include +#include "caffe2/core/common.h" #include "caffe2/core/logging.h" namespace caffe2 { @@ -29,7 +30,7 @@ MPI_DATATYPE_WRAPPER(double, MPI_DOUBLE) #undef MPI_DATATYPE_WRAPPER // For all Caffe MPI calls, we will wrap it inside an MPI mutex lock guard. -std::mutex& MPIMutex(); +CAFFE2_API std::mutex& MPIMutex(); #define MPI_CHECK(condition) \ do { \ @@ -49,23 +50,23 @@ std::mutex& MPIMutex(); * @brief Gets the global MPI communicator used by Caffe2. In default, this * is MPI_COMM_WORLD unless you call SetGlobalMPIComm(). */ -MPI_Comm GlobalMPIComm(); +CAFFE2_API MPI_Comm GlobalMPIComm(); /** * @brief Sets the global MPI communicator. Caffe2 takes over the ownership * of the passed in communicator. */ -void SetGlobalMPIComm(MPI_Comm new_comm); +CAFFE2_API void SetGlobalMPIComm(MPI_Comm new_comm); /** * @brief A helper function to return the size of the given communicator. */ -int MPICommSize(MPI_Comm comm); +CAFFE2_API int MPICommSize(MPI_Comm comm); /** * @brief A helper function to return the rank of the given communicator. */ -int MPICommRank(MPI_Comm comm); +CAFFE2_API int MPICommRank(MPI_Comm comm); /** * @brief A simple wrapper over an MPI common world. 
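The mpi_common.h hunk only adds CAFFE2_API to declarations such as MPIMutex() and GlobalMPIComm() so they are exported from the Caffe2 shared library and reachable from other binaries. Export macros of this kind generally expand along the lines of the sketch below; the names (MYLIB_API, BUILDING_MYLIB, mylib_answer) are illustrative stand-ins, not the actual caffe2 macro definition:

```
// Typical export/import macro layout (illustrative only).
#if defined(_WIN32)
  #if defined(BUILDING_MYLIB)
    #define MYLIB_API __declspec(dllexport)  // building the DLL itself
  #else
    #define MYLIB_API __declspec(dllimport)  // consuming the DLL
  #endif
#else
  #define MYLIB_API __attribute__((visibility("default")))
#endif

// Annotated declarations stay visible even when the library is built with
// hidden default visibility, which is what the CAFFE2_API additions ensure.
MYLIB_API int mylib_answer();
```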
diff --git a/caffe2/operators/concat_split_op.cc b/caffe2/operators/concat_split_op.cc index a8f4c91e7e5404..31256026028dfa 100644 --- a/caffe2/operators/concat_split_op.cc +++ b/caffe2/operators/concat_split_op.cc @@ -311,8 +311,8 @@ op = core.CreateOperator( axis=3 ) -workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW -workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW +workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW +workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) // NCHW print("X1:", workspace.FetchBlob("X1")) print("X2:", workspace.FetchBlob("X2")) workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/conv_op.cc b/caffe2/operators/conv_op.cc index 082c94fb6c18fb..30fb79d3846942 100644 --- a/caffe2/operators/conv_op.cc +++ b/caffe2/operators/conv_op.cc @@ -42,24 +42,24 @@ op = core.CreateOperator( stride=2 ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(1,1,8,8).astype(np.float32) print("Data shape: ",data.shape) -# Create W: (M,C,Kh,Kw) +// Create W: (M,C,Kh,Kw) filters = np.random.randn(3,1,5,5).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.,1.,1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/conv_transpose_op.cc b/caffe2/operators/conv_transpose_op.cc index 57ec02b63ea0dd..7de16afaed9158 100644 --- a/caffe2/operators/conv_transpose_op.cc +++ b/caffe2/operators/conv_transpose_op.cc @@ -44,24 +44,24 @@ op = core.CreateOperator( strides=[2,2] ) -# Create X: (N,C,H,W) +// Create X: (N,C,H,W) data = np.random.randn(2,3,5,5).astype(np.float32) print("Data shape: ",data.shape) -# Create filter: (M,C,Kh,Kw) +// Create filter: (M,C,Kh,Kw) filters = np.random.randn(3,1,2,2).astype(np.float32) print("Filter shape: ",filters.shape) -# Create b: M +// Create b: M bias = np.array([1.]).astype(np.float32) print("Bias shape: ",bias.shape) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("filter", filters) workspace.FeedBlob("bias", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/counter_ops.cc b/caffe2/operators/counter_ops.cc index 15cdab5849cc1f..50e4b9448af310 100644 --- a/caffe2/operators/counter_ops.cc +++ b/caffe2/operators/counter_ops.cc @@ -58,22 +58,22 @@ resetcounter_op = core.CreateOperator( ) -# Create counter +// Create counter workspace.RunOperatorOnce(createcounter_op) print("'counter' pointer:", workspace.FetchBlob("counter")) -# Retrieve initial counter value +// Retrieve initial counter value workspace.RunOperatorOnce(retrievecount_op) print("Initial 'count':", workspace.FetchBlob("count")) -# Check if counter is done +// Check if counter is done workspace.RunOperatorOnce(checkcounterdone_op) print("Initial 'done' value:", workspace.FetchBlob("done")) -# Test CountUp operator +// Test CountUp operator print("\nTesting CountUp operator...") for i in range(5): workspace.RunOperatorOnce(countup_op) @@ -83,7 +83,7 @@ workspace.RunOperatorOnce(retrievecount_op) print("'count' value after CountUp test:", 
workspace.FetchBlob("count")) -# Test CountDown operator +// Test CountDown operator print("\nTesting CountDown operator...") for i in range(11): workspace.RunOperatorOnce(countdown_op) diff --git a/caffe2/operators/cross_entropy_op.cc b/caffe2/operators/cross_entropy_op.cc index 584b7abd5a183f..0473e7d4e435b3 100644 --- a/caffe2/operators/cross_entropy_op.cc +++ b/caffe2/operators/cross_entropy_op.cc @@ -401,22 +401,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors +// Create label: Sample 1-hot ground truth label vectors label = np.array([4,2]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -635,22 +635,22 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([[.01, .05, .02, .02, .9],[.03, .1, .42, .05, .4]]) print("X:\n",X) -# Create label: Sample 1-hot ground truth label vectors +// Create label: Sample 1-hot ground truth label vectors label = np.array([[0.,0.,0.,0.,1.],[0.,0.,1.,0.,0.]]) print("label:\n",label) -# Feed X & label into workspace +// Feed X & label into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("label", label.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/distance_op.cc b/caffe2/operators/distance_op.cc index d9abfa0e254336..9a38a4a77a0043 100644 --- a/caffe2/operators/distance_op.cc +++ b/caffe2/operators/distance_op.cc @@ -437,22 +437,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = 5*np.ones((1, 4)) print("X:\n",X) -# Create Y +// Create Y Y = np.ones((1, 4)) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` @@ -645,22 +645,22 @@ op = core.CreateOperator( ["Z"] ) -# Create X +// Create X X = np.random.randn(3, 3) print("X:\n",X) -# Create Y +// Create Y Y = np.random.randn(3, 3) print("Y:\n",Y) -# Feed X & Y into workspace +// Feed X & Y into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("Y", Y.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Z:\n", workspace.FetchBlob("Z")) ``` diff --git a/caffe2/operators/elementwise_linear_op.cc b/caffe2/operators/elementwise_linear_op.cc index d68bfbc5a0eb93..371aae78a25201 100644 --- a/caffe2/operators/elementwise_linear_op.cc +++ b/caffe2/operators/elementwise_linear_op.cc @@ -112,28 +112,28 @@ op = core.CreateOperator( ["Y"] ) -# Create X +// Create X X = np.array([[1,2,3,4,5],[6,8,9,16,10]]) print("X:\n",X) -# Create w +// Create w w = np.array([1,1/2.,1/3.,1/4.,1/5.]) print("w:\n",w) -# Create b +// Create b b = np.array([1.,1.,1.,1.,1.]) print("b:\n",b) -# Feed X & w & b into workspace +// Feed X & 
w & b into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.FeedBlob("w", w.astype(np.float32)) workspace.FeedBlob("b", b.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/elementwise_logical_ops.cc b/caffe2/operators/elementwise_logical_ops.cc index 5ddd4570356e9d..0e2da569dcb11f 100644 --- a/caffe2/operators/elementwise_logical_ops.cc +++ b/caffe2/operators/elementwise_logical_ops.cc @@ -63,7 +63,7 @@ op = core.CreateOperator( value=[0,2,4,6,8], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("X", np.array([0,1,2,3,4,5,6,7,8]).astype(np.int32)) print("X:\n", workspace.FetchBlob("X")) @@ -75,7 +75,7 @@ print("Y: \n", workspace.FetchBlob("Y")) **Result** ``` -# value=[0,2,4,6,8] +// value=[0,2,4,6,8] X: [0 1 2 3 4 5 6 7 8] diff --git a/caffe2/operators/elementwise_sum_op.cc b/caffe2/operators/elementwise_sum_op.cc index 861f4f115c0a41..dee3671f5bdc4a 100644 --- a/caffe2/operators/elementwise_sum_op.cc +++ b/caffe2/operators/elementwise_sum_op.cc @@ -86,7 +86,7 @@ workspace.ResetWorkspace() op = core.CreateOperator( "Sum", ["A", "B"], - ["A"], # inplace + ["A"], // inplace ) workspace.FeedBlob("A", np.array([[1,2,5],[8,3,4]]).astype(np.float32)) diff --git a/caffe2/operators/filler_op.cc b/caffe2/operators/filler_op.cc index ff3eac217390a4..c5a121e3a222d6 100644 --- a/caffe2/operators/filler_op.cc +++ b/caffe2/operators/filler_op.cc @@ -298,11 +298,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13.8, dtype=np.float32)) @@ -389,11 +389,11 @@ op_2 = core.CreateOperator( input_as_shape=1 ) -# Test arg-based op +// Test arg-based op workspace.RunOperatorOnce(op_1) print("output (op_1):\n", workspace.FetchBlob("output")) -# Test input-based op +// Test input-based op workspace.ResetWorkspace() workspace.FeedBlob("shape", np.array([5,5])) workspace.FeedBlob("min", np.array(13, dtype=np.int32)) diff --git a/caffe2/operators/fully_connected_op.cc b/caffe2/operators/fully_connected_op.cc index 6fe95eefbac476..e14fec6f8464b8 100644 --- a/caffe2/operators/fully_connected_op.cc +++ b/caffe2/operators/fully_connected_op.cc @@ -182,9 +182,9 @@ Github Links: ``` -# In this example, our batch size is 1 (M=1), the input observation will have -# 6 features (K=6), and the layer will have one hidden node (N=1). The -# expected output is Y=7. +// In this example, our batch size is 1 (M=1), the input observation will have +// 6 features (K=6), and the layer will have one hidden node (N=1). The +// expected output is Y=7. 
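+// Worked check of that expectation with the X, W, b defined below:
+//   Y = X.W^T + b = (1*1 + 2*(1/2) + 3*(1/3) + 4*(1/4) + 5*(1/5) + 6*(1/6)) + 1
+//     = (1 + 1 + 1 + 1 + 1 + 1) + 1 = 7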
workspace.ResetWorkspace() op = core.CreateOperator( @@ -193,23 +193,23 @@ op = core.CreateOperator( ["Y"] ) -# Create X: MxK +// Create X: MxK data = np.array([1,2,3,4,5,6]).astype(np.float32) data = data[np.newaxis,:] -# Create W: NxK +// Create W: NxK weights = np.array(np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.])).astype(np.float32) weights = weights[np.newaxis,:] -# Create b: N +// Create b: N bias = np.array([1.]).astype(np.float32) -# Put the inputs into the workspace +// Put the inputs into the workspace workspace.FeedBlob("X", data) workspace.FeedBlob("W", weights) workspace.FeedBlob("b", bias) -# Run the operator +// Run the operator workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/gather_op.cc b/caffe2/operators/gather_op.cc index cee268ddafdcbd..34c42bfc983f84 100644 --- a/caffe2/operators/gather_op.cc +++ b/caffe2/operators/gather_op.cc @@ -37,7 +37,7 @@ print("DATA:\n",data) inds = np.array([[0, 1],[1, 2]]) print("INDICES:\n",inds) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("DATA", data.astype(np.float32)) workspace.FeedBlob("INDICES", inds.astype(np.int32)) diff --git a/caffe2/operators/hard_sigmoid_op.cu b/caffe2/operators/hard_sigmoid_op.cu new file mode 100644 index 00000000000000..ed3a4ec8286888 --- /dev/null +++ b/caffe2/operators/hard_sigmoid_op.cu @@ -0,0 +1,91 @@ +#include "caffe2/operators/hard_sigmoid_op.h" + +#include +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +__global__ void HardSigmoidCUDAKernel( + const int N, + const T alpha, + const T beta, + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + Y[i] = max(T(0), min(T(1), alpha * __ldg(X + i) + beta)); +#else + Y[i] = max(T(0), min(T(1), alpha * X[i] + beta)); +#endif + } +} + +template +__global__ void HardSigmoidGradientCUDAKernel( + const int N, + const T alpha, + const T* dY, + const T* Y, + T* dX) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = (__ldg(Y + i) > T(0) && __ldg(Y + i) < T(1)) ? __ldg(dY + i) * alpha + : T(0); +#else + dX[i] = (Y[i] > T(0) && Y[i] < T(1)) ? 
dY[i] * alpha : T(0); +#endif + } +} + +} // namespace + +template <> +template +bool HardSigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + HardSigmoidCUDAKernel + <<cuda_stream()>>>(N, alpha, beta, X, Y); + return true; +} + +template <> +template +bool HardSigmoidGradientFunctor::Forward( + const std::vector& Y_dims, + const std::vector& /* dY_dims */, + const T* Y, + const T* dY, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + Y_dims.cbegin(), Y_dims.cend(), 1, std::multiplies()); + HardSigmoidGradientCUDAKernel + <<cuda_stream()>>>(size, alpha, dY, Y, dX); + return true; +} + +REGISTER_CUDA_OPERATOR( + HardSigmoid, + UnaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidFunctor>); +REGISTER_CUDA_OPERATOR( + HardSigmoidGradient, + BinaryElementwiseWithArgsOp< + TensorTypes, + CUDAContext, + HardSigmoidGradientFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/local_response_normalization_op.cc b/caffe2/operators/local_response_normalization_op.cc index 1cba60e86d9787..81499b4a5d6abf 100644 --- a/caffe2/operators/local_response_normalization_op.cc +++ b/caffe2/operators/local_response_normalization_op.cc @@ -342,7 +342,7 @@ op = core.CreateOperator("LRN", order="NHWC" ) -workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc index f877786648350b..f39aaaa6397a3e 100644 --- a/caffe2/operators/lp_pool_op.cc +++ b/caffe2/operators/lp_pool_op.cc @@ -258,7 +258,7 @@ op = core.CreateOperator( p=2.0 ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index 6af404d1153588..79c35cd83a2148 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -100,7 +100,7 @@ op = core.CreateOperator( X = np.array([5., 2.]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/pool_op.cc b/caffe2/operators/pool_op.cc index eca7978e024aac..87d67b17e2b6ce 100644 --- a/caffe2/operators/pool_op.cc +++ b/caffe2/operators/pool_op.cc @@ -764,7 +764,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) @@ -832,7 +832,7 @@ op = core.CreateOperator( stride=2, ) -workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) print("Y:\n", workspace.FetchBlob("Y")) diff --git a/caffe2/operators/reduction_ops.cc b/caffe2/operators/reduction_ops.cc index 0d01d50ca000e3..95f15b56a720e9 100644 --- a/caffe2/operators/reduction_ops.cc +++ 
b/caffe2/operators/reduction_ops.cc @@ -139,17 +139,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -226,17 +226,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X, simulating a batch of 2, 4x4 matricies +// Create X, simulating a batch of 2, 4x4 matricies X = np.random.randint(0,high=20,size=(2,4,4)) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.float32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` diff --git a/caffe2/operators/relu_op.cc b/caffe2/operators/relu_op.cc index 03205241efc3e1..0f1abd82396156 100644 --- a/caffe2/operators/relu_op.cc +++ b/caffe2/operators/relu_op.cc @@ -105,7 +105,7 @@ op = core.CreateOperator( ["Y"] ) -workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) # NCHW +workspace.FeedBlob("X", np.random.randn(4, 4).astype(np.float32)) // NCHW print("X:\n", workspace.FetchBlob("X"), "\n") workspace.RunOperatorOnce(op) diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc index bea0b43d751ccf..d968112c9ecc2d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.cc +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -48,8 +48,8 @@ vector and `values` tensor into a compacted tensor where the first dimension corresponds to each id provided in mask argument. Missing values are filled with the value of `default_value`. After running this op: - output[j, :] = values[i] # where mask[j] == indices[i] - output[j, ...] = default_value # when mask[j] doesn't appear in indices + output[j, :] = values[i] // where mask[j] == indices[i] + output[j, ...] = default_value // when mask[j] doesn't appear in indices If `lengths` is provided and not empty, and extra "batch" dimension is prepended to the output. diff --git a/caffe2/operators/sparse_to_dense_op.cc b/caffe2/operators/sparse_to_dense_op.cc index 4f6a49796df826..0c9519e6576122 100644 --- a/caffe2/operators/sparse_to_dense_op.cc +++ b/caffe2/operators/sparse_to_dense_op.cc @@ -23,7 +23,7 @@ representation. After running this op: - output[indices[i], :] += values[i] # sum over all indices[i] equal to the index + output[indices[i], :] += values[i] // sum over all indices[i] equal to the index output[j, ...] 
= 0 if j not in indices )DOC") .Input(0, "indices", "1-D int32/int64 tensor of concatenated ids of data") diff --git a/caffe2/operators/stats_ops.cc b/caffe2/operators/stats_ops.cc index 508dd1ae82060a..d07f9cace13636 100644 --- a/caffe2/operators/stats_ops.cc +++ b/caffe2/operators/stats_ops.cc @@ -290,7 +290,7 @@ timergetandend_op = core.CreateOperator( ["nanos"] ) -# Test TimerBegin/TimerGet/TimerEnd +// Test TimerBegin/TimerGet/TimerEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timerget_op) @@ -298,7 +298,7 @@ print("nanos:", workspace.FetchBlob("nanos")) workspace.RunOperatorOnce(timerend_op) -# Test TimerBegin/TimerGetAndEnd +// Test TimerBegin/TimerGetAndEnd workspace.RunOperatorOnce(timerbegin_op) print("timer:", workspace.FetchBlob("timer")) workspace.RunOperatorOnce(timergetandend_op) diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index cc7c037a6d332d..eb771974fbf397 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -103,17 +103,17 @@ op = core.CreateOperator( ["Y"] ) -# Create X: Sample softmax output for 5-class model +// Create X: Sample softmax output for 5-class model X = np.array([2,2,2,2,2,2,2,2,2,2]) print("X:\n",X) -# Feed X into workspace +// Feed X into workspace workspace.FeedBlob("X", X.astype(np.int32)) -# Run op +// Run op workspace.RunOperatorOnce(op) -# Collect Output +// Collect Output print("Y:\n", workspace.FetchBlob("Y")) ``` @@ -508,14 +508,14 @@ op = core.CreateOperator( ["has_elements"], ) -# Use a not-empty tensor +// Use a not-empty tensor workspace.FeedBlob("tensor", np.random.randn(2, 2).astype(np.float32)) print("tensor:\n", workspace.FetchBlob("tensor")) workspace.RunOperatorOnce(op) print("has_elements: ", workspace.FetchBlob("has_elements"),"\n") -# Use an empty tensor +// Use an empty tensor workspace.FeedBlob("tensor", np.empty(0)) print("tensor:\n", workspace.FetchBlob("tensor")) diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 6a8d22253444a5..80e2308eabf3cd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -322,14 +322,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mappushInstructionNode(opNode); } - CAFFE_ENFORCE( - externalInputNames.size() == 0, - "Attempting to convert an ill-formed network: \ - external_input contains unused blobs"); + if (externalInputNames.size()) { + std::ostringstream os; + for (const auto& inputName : externalInputNames) { + os << "\"" << inputName << "\" "; + } + + CAFFE_ENFORCE( + externalInputNames.size() == 0, + "Attempting to convert an ill-formed network: external_input contains ", + externalInputNames.size(), + " unused blobs: ", + os.str()); + } for (const auto& outputName : net.external_output()) { CAFFE_ENFORCE( - blobMap.count(outputName), "NetDef has ill-formed external_output"); + blobMap.count(outputName), + "NetDef has ill-formed external_output: \"", + outputName, + "\""); module.outputs.insert(blobMap[outputName]); } diff --git a/caffe2/predictor/predictor_config.cc b/caffe2/predictor/predictor_config.cc index aabff0daffcd73..0ca120d0121da5 100644 --- a/caffe2/predictor/predictor_config.cc +++ b/caffe2/predictor/predictor_config.cc @@ -10,7 +10,7 @@ namespace { // We don't use the getNet() from predictor_utils.cc here because that file // has additional dependencies that we want to avoid bringing in, to keep the // binary size as small as possible. 
-const NetDef& getNet(const MetaNetDef& def, const std::string& name) { +static const NetDef& getNet(const MetaNetDef& def, const std::string& name) { for (const auto& n : def.nets()) { if (n.key() == name) { return n.value(); @@ -19,7 +19,7 @@ const NetDef& getNet(const MetaNetDef& def, const std::string& name) { CAFFE_THROW("Net not found: ", name); } -const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( +static const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( const MetaNetDef& def, const std::string& name) { for (const auto& b : def.blobs()) { @@ -30,26 +30,60 @@ const ::google::protobuf::RepeatedPtrField<::std::string>& getBlobs( CAFFE_THROW("Blob not found: ", name); } +static std::string combine(const std::string& str, const std::string& name) { + if (name.empty()) { + return std::string(str); + } + return str + "_" + name; +} + +static std::string getNamedPredictNet(const string& name) { + return combine(PredictorConsts::default_instance().predict_net_type(), name); +} + +static std::string getNamedInitNet(const string& name) { + return combine( + PredictorConsts::default_instance().predict_init_net_type(), name); +} + +static std::string getNamedInputs(const string& name) { + return combine(PredictorConsts::default_instance().inputs_blob_type(), name); +} + +static std::string getNamedOutputs(const string& name) { + return combine(PredictorConsts::default_instance().outputs_blob_type(), name); +} + +static std::string getNamedParams(const string& name) { + return combine( + PredictorConsts::default_instance().parameters_blob_type(), name); +} + } // namespace -PredictorConfig -makePredictorConfig(const MetaNetDef& def, Workspace* parent, bool run_init) { - const auto& init_net = - getNet(def, PredictorConsts::default_instance().global_init_net_type()); - const auto& run_net = - getNet(def, PredictorConsts::default_instance().predict_net_type()); +PredictorConfig makePredictorConfig( + const MetaNetDef& def, + Workspace* parent, + bool run_init, + const std::string& net_name) { + const auto& init_net = getNet(def, getNamedInitNet(net_name)); + const auto& run_net = getNet(def, getNamedPredictNet(net_name)); auto config = makePredictorConfig(init_net, run_net, parent, run_init); - const auto& inputs = - getBlobs(def, PredictorConsts::default_instance().inputs_blob_type()); + const auto& inputs = getBlobs(def, getNamedInputs(net_name)); for (const auto& input : inputs) { config.input_names.emplace_back(input); } - const auto& outputs = - getBlobs(def, PredictorConsts::default_instance().outputs_blob_type()); + const auto& outputs = getBlobs(def, getNamedOutputs(net_name)); for (const auto& output : outputs) { config.output_names.emplace_back(output); } + + const auto& params = getBlobs(def, getNamedParams(net_name)); + for (const auto& param : params) { + config.parameter_names.emplace_back(param); + } + return config; } diff --git a/caffe2/predictor/predictor_config.h b/caffe2/predictor/predictor_config.h index eda1c9d03ca2ba..b1555addfa6f08 100644 --- a/caffe2/predictor/predictor_config.h +++ b/caffe2/predictor/predictor_config.h @@ -45,7 +45,8 @@ CAFFE2_API Workspace makeWorkspace(std::shared_ptr paramete CAFFE2_API PredictorConfig makePredictorConfig( const MetaNetDef& net, Workspace* parent = nullptr, - bool run_init = true); + bool run_init = true, + const std::string& net_name = ""); CAFFE2_API PredictorConfig makePredictorConfig( const NetDef& init_net, diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc 
index 40e4f720c61900..326265fc66d039 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -209,33 +209,4 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); } -class PredictorMetaNetDefTest : public testing::Test { - public: - void SetUp() override { - DeviceOption op; - op.set_random_seed(1701); - ctx_ = caffe2::make_unique(op); - p_ = caffe2::make_unique( - makePredictorConfig(parseMetaNetDef(metaSpec))); - } - - std::unique_ptr ctx_; - std::unique_ptr p_; -}; - -TEST_F(PredictorMetaNetDefTest, SimpleMetaNetDefInitializer) { - auto inputData = randomTensor({1, 4}, ctx_.get()); - Predictor::TensorMap input; - auto iter = input.emplace("data", Tensor(CPU)); - auto tensor = inputData->GetMutableTensor(CPU); - iter.first->second.ResizeLike(*tensor); - iter.first->second.ShareData(*tensor); - Predictor::TensorList output; - (*p_)(input, &output); - EXPECT_EQ(output.size(), 1); - EXPECT_EQ(output.front().dims().size(), 2); - EXPECT_EQ(output.front().dim(0), 1); - EXPECT_EQ(output.front().dim(1), 10); - EXPECT_NEAR(output.front().data()[4], 0.1209, 1E-4); -} } // namespace caffe2 diff --git a/caffe2/predictor/predictor_utils.cc b/caffe2/predictor/predictor_utils.cc index 4af83d0bea8c25..f5acd4f936010b 100644 --- a/caffe2/predictor/predictor_utils.cc +++ b/caffe2/predictor/predictor_utils.cc @@ -1,4 +1,5 @@ #include "caffe2/predictor/predictor_utils.h" +#include "caffe2/predictor/predictor_config.h" #include "caffe2/core/blob.h" #include "caffe2/core/logging.h" @@ -6,6 +7,13 @@ #include "caffe2/proto/predictor_consts.pb.h" #include "caffe2/utils/proto_utils.h" +CAFFE2_DEFINE_bool( + caffe2_predictor_claim_tensor_memory, + true, + "If false, then predictor will not claim tensor memory" + "otherwise when tensor is shrinked to a size smaller than current size " + "by FLAGS_caffe2_max_keep_on_shrink_memory, the memory will be claimed."); + namespace caffe2 { namespace predictor_utils { @@ -79,4 +87,47 @@ std::unique_ptr runGlobalInitialization( } } // namespace predictor_utils + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws) { + for (const auto& blob : input_blobs) { + ws->RemoveBlob(blob); + } + for (const auto& blob : output_blobs) { + ws->RemoveBlob(blob); + } +} + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path) { + // TODO: Remove this flags once Predictor accept PredictorConfig as + // constructors. These comes are copied temporarly from the Predictor. + if (FLAGS_caffe2_predictor_claim_tensor_memory) { + if (FLAGS_caffe2_max_keep_on_shrink_memory == LLONG_MAX) { + FLAGS_caffe2_max_keep_on_shrink_memory = 8 * 1024 * 1024; + } + } + auto dbReader = + make_unique(db::CreateDB(db_type, db_path, db::READ)); + auto ws = std::make_shared(); + auto net_def = + predictor_utils::runGlobalInitialization(std::move(dbReader), ws.get()); + auto config = makePredictorConfig(*net_def, ws.get()); + config.ws = ws; + const auto& init_net = predictor_utils::getNet( + *net_def, PredictorConsts::default_instance().predict_init_net_type()); + CAFFE_ENFORCE(config.ws->RunNetOnce(init_net)); + config.ws->RemoveBlob( + PredictorConsts::default_instance().predictor_dbreader()); + // Input and output blobs should never be allocated in the master workspace + // since we'll end up with race-conditions due to these being shared among + // predictor threads / TL workspaces. 
Safely handle against globalInitNet + // creating them in the master. + removeExternalBlobs(config.input_names, config.output_names, config.ws.get()); + return config; +} + } // namespace caffe2 diff --git a/caffe2/predictor/predictor_utils.h b/caffe2/predictor/predictor_utils.h index 8c9cb4a5792d48..af7799b039c8b7 100644 --- a/caffe2/predictor/predictor_utils.h +++ b/caffe2/predictor/predictor_utils.h @@ -24,4 +24,14 @@ CAFFE2_API std::unique_ptr runGlobalInitialization( Workspace* master); } // namespace predictor_utils + +PredictorConfig makePredictorConfig( + const string& db_type, + const string& db_path); + +void removeExternalBlobs( + const std::vector& input_blobs, + const std::vector& output_blobs, + Workspace* ws); + } // namespace caffe2 diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 60e5c39bed1318..ae169eef2e6480 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -44,6 +44,7 @@ def Parallelize( param_update_builder_fun=None, optimizer_builder_fun=None, post_sync_builder_fun=None, + pre_grad_net_transformer_fun=None, net_transformer_fun=None, devices=None, rendezvous=None, @@ -91,6 +92,11 @@ def Parallelize( Signature: net_transformer_fun( model, num_devices, device_prefix, device_type) + pre_grad_net_transformer_fun: + Optional function to transform the network similar to + net_transformer_fun, but happens before gradient ops + been add. + Signature: pre_grad_net_transformer_fun(model) post_sync_builder_fun: Function applied after initial parameter sync has been completed, such as keeping multi-precision parameters @@ -234,6 +240,9 @@ def Parallelize( model_helper_obj._computed_param_names =\ list(viewkeys(computed_params_grouped)) + if pre_grad_net_transformer_fun: + pre_grad_net_transformer_fun(model_helper_obj) + if has_parameter_updates: log.info("Adding gradient operators") _AddGradientOperators(devices, model_helper_obj, losses_by_gpu) diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py new file mode 100644 index 00000000000000..19bdbaac8a217e --- /dev/null +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -0,0 +1,99 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import hypothesis.strategies as st +from hypothesis import given +import numpy as np +from caffe2.python import core, workspace +from caffe2.proto import caffe2_pb2 +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestFallbackOps(hu.HypothesisTestCase): + @given(stride=st.integers(1, 3), + pad=st.integers(0, 3), + kernel=st.integers(3, 5), + size=st.integers(8, 10), + input_channels=st.integers(1, 3), + output_channels=st.integers(1, 5), + batch_size=st.integers(1, 3), + use_bias=st.booleans(), + **mu.gcs) + def test_in_place(self, stride, pad, kernel, size, + input_channels, output_channels, + batch_size, use_bias, gc, dc): + # To expose fallback in-place potential issue, the fallback op + # following ideep op must be run at least two iterations. 
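+        # The second run reuses the output tensor created inside the fallback
+        # op's local workspace; a wrongly forwarded in-place blob or a
+        # non-public-format buffer only becomes visible on that reuse.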
+ conv = core.CreateOperator( + "Conv", + ["X", "w", "b"] if use_bias else ["X", "w"], + ["Y"], + stride=stride, + pad=pad, + kernel=kernel, + device_option=dc[0] + ) + X = np.random.rand( + batch_size, input_channels, size, size).astype(np.float32) - 0.5 + w = np.random.rand(output_channels, input_channels, kernel, kernel) \ + .astype(np.float32) - 0.5 + b = np.random.rand(output_channels).astype(np.float32) - 0.5 + + old_ws_name = workspace.CurrentWorkspace() + workspace.SwitchWorkspace("_device_check_", True) + workspace.FeedBlob('X', X, dc[0]) + workspace.FeedBlob('w', w, dc[0]) + workspace.FeedBlob('b', b, dc[0]) + workspace.RunOperatorOnce(conv) + Y = workspace.FetchBlob('Y') + + scale = np.random.randn(Y.shape[1]).astype(np.float32) + bias = np.random.randn(Y.shape[1]).astype(np.float32) + ac = core.CreateOperator( + "AffineChannel", + ["Y", "scale", "bias"], + ["Y"], + is_learnable=False, + device_option=dc[0] + ) + workspace.FeedBlob('scale', scale, dc[0]) + workspace.FeedBlob('bias', bias, dc[0]) + workspace.RunOperatorOnce(ac) + workspace.RunOperatorOnce(conv) + workspace.RunOperatorOnce(ac) + Y0 = workspace.FetchBlob('Y') + + workspace.ResetWorkspace() + dev_net = caffe2_pb2.NetDef() + conv_dev = caffe2_pb2.OperatorDef() + conv_dev.CopyFrom(conv) + conv_dev.device_option.CopyFrom(dc[1]) + ac_dev = caffe2_pb2.OperatorDef() + ac_dev.CopyFrom(ac) + ac_dev.device_option.CopyFrom(dc[1]) + dev_net.op.extend([conv_dev, ac_dev]) + workspace.FeedBlob('X', X, dc[1]) + workspace.FeedBlob('w', w, dc[1]) + workspace.FeedBlob('b', b, dc[1]) + workspace.FeedBlob('scale', scale, dc[1]) + workspace.FeedBlob('bias', bias, dc[1]) + workspace.RunNetOnce(dev_net) + workspace.RunNetOnce(dev_net) + Y1 = workspace.FetchBlob('Y') + + if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): + print(Y1.flatten()) + print(Y0.flatten()) + print(np.max(np.abs(Y1 - Y0))) + self.assertTrue(False) + + workspace.SwitchWorkspace(old_ws_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 0e590307a88858..c20aad4218f17e 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -338,7 +338,7 @@ def sigmoid_ref(X): alpha=st.floats(min_value=-100.0, max_value=100.0), beta=st.floats(min_value=-100.0, max_value=100.0), engine=st.sampled_from([""]), - **hu.gcs_cpu_only) + **hu.gcs) def test_hard_sigmoid(self, X, inplace, alpha, beta, engine, gc, dc): # Prevent alpha and beta from mutually being 0 to avoid a division # error when adjusting our inputs diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 668c812cd8e1a8..056558c9a73335 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -9,6 +9,7 @@ #include #include +#include "caffe2/ideep/operators/operator_fallback_ideep.h" #include namespace caffe2 { @@ -19,42 +20,42 @@ USE_IDEEP_DEF_ALIASES(); class IDeepFetcher; class IDeepFeeder; -REGISTER_BLOB_FETCHER((TypeMeta::Id()),IDeepFetcher); +REGISTER_IDEEP_OPERATOR(Python, IDEEPFallbackOp>); + +REGISTER_BLOB_FETCHER((TypeMeta::Id()), IDeepFetcher); REGISTER_BLOB_FEEDER(IDEEP, IDeepFeeder); class IDeepFetcher : public BlobFetcherBase { TypeMeta type_transform(const itensor &atensor) { - switch(atensor.get_data_type()) { - case itensor::data_type::f32: - return TypeMeta::Make(); - case itensor::data_type::s16: - return TypeMeta::Make(); - case 
itensor::data_type::s32: - return TypeMeta::Make(); - case itensor::data_type::s8: - return TypeMeta::Make(); - case itensor::data_type::u8: - return TypeMeta::Make(); - default: - // Should we throw exception? - return TypeMeta(); + switch (atensor.get_data_type()) { + case itensor::data_type::f32: + return TypeMeta::Make(); + case itensor::data_type::s32: + return TypeMeta::Make(); + case itensor::data_type::s8: + return TypeMeta::Make(); + case itensor::data_type::u8: + return TypeMeta::Make(); + default: + // Should we throw exception? + return TypeMeta(); } } - public: - pybind11::object Fetch(const Blob& blob) override { +public: + pybind11::object Fetch(const Blob &blob) override { try { return FetchTensor(blob.Get(), true).obj; - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; throw; } } - FetchedBlob FetchTensor(const itensor& atensor, bool force_copy) { + FetchedBlob FetchTensor(const itensor &atensor, bool force_copy) { FetchedBlob result; CAFFE_ENFORCE(atensor.materialized(), - "Trying to fetch uninitialized tensor"); + "Trying to fetch uninitialized tensor"); const int numpy_type = CaffeToNumpyType(type_transform(atensor)); CAFFE_ENFORCE( numpy_type != -1, @@ -64,17 +65,16 @@ class IDeepFetcher : public BlobFetcherBase { std::vector npy_dims(dims.begin(), dims.end()); result.copied = force_copy || atensor.need_reorder(); - void* outPtr; + void *outPtr; if (result.copied) { result.obj = py::reinterpret_steal( PyArray_SimpleNew(atensor.ndims(), npy_dims.data(), numpy_type)); outPtr = static_cast( - PyArray_DATA(reinterpret_cast(result.obj.ptr()))); + PyArray_DATA(reinterpret_cast(result.obj.ptr()))); } else { outPtr = atensor.get_data_handle(); - result.obj = py::reinterpret_steal( - PyArray_SimpleNewFromData( - atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); + result.obj = py::reinterpret_steal(PyArray_SimpleNewFromData( + atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); } if (numpy_type == NPY_OBJECT) { @@ -95,8 +95,6 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::f32; else if (meta == TypeMeta::Make()) return itensor::data_type::s32; - else if (meta == TypeMeta::Make()) - return itensor::data_type::s16; else if (meta == TypeMeta::Make()) return itensor::data_type::s8; else if (meta == TypeMeta::Make()) @@ -105,53 +103,74 @@ class IDeepFeeder : public BlobFeederBase { return itensor::data_type::data_undef; } - public: - void FeedTensor( - const DeviceOption& option, - PyArrayObject *original_array, - itensor *tensor) { - PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() {Py_XDECREF(array); }); - - const auto npy_type = PyArray_TYPE(array); - const TypeMeta& meta = NumpyTypeToCaffe(npy_type); - CAFFE_ENFORCE( - meta.id() != TypeIdentifier::uninitialized(), +public: + void FeedTensor( + const DeviceOption &option, + PyArrayObject *original_array, + itensor *tensor) { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + CAFFE_ENFORCE_NE( + meta.id(), + TypeIdentifier::uninitialized(), "This numpy data type is not supported: ", - PyArray_TYPE(array), - "."); + PyArray_TYPE(array), "."); - int ndim = PyArray_NDIM(array); - npy_intp* npy_dims = PyArray_DIMS(array); + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); - 
itensor::dims adims; - for (int i = 0; i < ndim; i++) { - adims.push_back(static_cast( - npy_dims[i])); - } + itensor::dims adims; + for (int i = 0; i < ndim; i++) { + adims.push_back(static_cast(npy_dims[i])); + } - switch (npy_type) { + switch (npy_type) { case NPY_OBJECT: case NPY_UNICODE: CAFFE_THROW("IDeep doesn't support string"); break; default: auto type = type_transform(meta); - tensor->resize(adims, type); + if (tensor->get_dims() != adims || type != tensor->get_data_type()) { + tensor->resize(adims, type); + } tensor->reorder_from(adims, type, - static_cast(PyArray_DATA(array))); - } - } + static_cast(PyArray_DATA(array))); + } + } - void Feed(const DeviceOption& option, PyArrayObject* original_array, - Blob* blob) { - try { + bool ZeroDim(PyArrayObject *array) { + int ndim = PyArray_NDIM(array); + npy_intp *npy_dims = PyArray_DIMS(array); + return ndim == 0 || + std::find(npy_dims, npy_dims + ndim, 0) != npy_dims + ndim; + } + + void Feed(const DeviceOption &option, PyArrayObject *original_array, + Blob *blob) { + try { + PyArrayObject *array = PyArray_GETCONTIGUOUS(original_array); + auto g = MakeGuard([&]() { Py_XDECREF(array); }); + + const auto npy_type = PyArray_TYPE(array); + const TypeMeta &meta = NumpyTypeToCaffe(npy_type); + // TODO: if necessary, use dispatcher. + if (meta.Match() && !ZeroDim(original_array)) { FeedTensor(option, original_array, blob->GetMutable()); - } catch (ideep::error& e) { - VLOG(1) << "IDEEP error: " << e.message; - throw; + } else { + DeviceOption cpu_option(option); + cpu_option.set_device_type(DeviceType::CPU); + TensorFeeder cpu_tensor_feeder; + cpu_tensor_feeder.FeedTensor(cpu_option, original_array, + blob->GetMutableTensor(CPU)); } - } + } catch (ideep::error &e) { + LOG(ERROR) << "IDEEP error: " << e.message; + throw; + } + } }; } // namespace python diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 5aafdf63c3b28a..9cfe7089332a18 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -228,7 +228,7 @@ def num_registered_tasks(self): def used_nodes(self): # use list to keep order used = [] - for task in self.tasks(): + for task in self._tasks + self._tasks_to_add: if task.node not in used: used.append(task.node) return used diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 2437933ae624eb..1a579b519fe09c 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -391,3 +391,17 @@ def test_transformer_FuseConv3DBN( rtol=1e-02, atol=1e-04 ) + + def test_converterEnforceUnusedInputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_input.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter + + def test_converterEnforceUnusedOutputs(self): + net = core.Net("net") + net.Relu(["X"], ["Y"]) + net.Proto().external_output.extend(["fake"]) + with self.assertRaises(Exception): + transformer.AddNNPACK(net) # just testing the converter diff --git a/caffe2/utils/smart_tensor_printer_test.cc b/caffe2/utils/smart_tensor_printer_test.cc index 82a59ad60aa950..e207f7c7b05284 100644 --- a/caffe2/utils/smart_tensor_printer_test.cc +++ b/caffe2/utils/smart_tensor_printer_test.cc @@ -39,6 +39,9 @@ void printTensorAndCheck(const std::vector& values) { expect_stderr_contains(values); } +// We need real glog for this test to pass +#ifdef CAFFE2_USE_GOOGLE_GLOG + #if !(__APPLE__) // TODO(janusz): thread_local does not work under mac. 
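The rewritten IDeepFeeder::Feed above routes a numpy array to an ideep tensor only when its dtype matches and it has no zero-sized dimension; anything else falls back to a plain CPU tensor feed. A rough Python paraphrase of that dispatch decision, assuming the dtype gate is float32 (the C++ template arguments are not visible in this rendering), is:

    import numpy as np

    def feeds_as_ideep_tensor(array):
        # Mirrors the ZeroDim() helper: scalar arrays and arrays with a
        # zero-sized dimension take the CPU fallback path.
        zero_dim = array.ndim == 0 or 0 in array.shape
        return array.dtype == np.float32 and not zero_dim

    assert feeds_as_ideep_tensor(np.ones((2, 3), dtype=np.float32))
    assert not feeds_as_ideep_tensor(np.array(1.0, dtype=np.float32))  # zero-dim
    assert not feeds_as_ideep_tensor(np.ones((2, 3), dtype=np.int64))  # wrong dtype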
TEST(SmartTensorPrinterTest, SimpleTest) { @@ -48,4 +51,6 @@ TEST(SmartTensorPrinterTest, SimpleTest) { #endif // !(__APPLE__) +#endif // CAFFE2_USE_GOOGLE_GLOG + } // namespace caffe2 diff --git a/docs/source/torch.rst b/docs/source/torch.rst index fa2f92092758a4..d385ff07d323d5 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -169,6 +169,7 @@ Pointwise Ops .. autofunction:: cos .. autofunction:: cosh .. autofunction:: div +.. autofunction:: digamma .. autofunction:: erf .. autofunction:: erfc .. autofunction:: erfinv diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt index f18077b829427b..1791ca27a98590 100644 --- a/modules/detectron/CMakeLists.txt +++ b/modules/detectron/CMakeLists.txt @@ -11,4 +11,8 @@ if (USE_CUDA) target_link_libraries(caffe2_detectron_ops_gpu caffe2_gpu) install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib) +elseif(NOT IOS_PLATFORM) + add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS}) + target_link_libraries(caffe2_detectron_ops caffe2) + install(TARGETS caffe2_detectron_ops DESTINATION lib) endif() diff --git a/modules/detectron/batch_permutation_op.cc b/modules/detectron/batch_permutation_op.cc index f92d7dd236d758..032288f811de08 100644 --- a/modules/detectron/batch_permutation_op.cc +++ b/modules/detectron/batch_permutation_op.cc @@ -15,9 +15,19 @@ */ #include "batch_permutation_op.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + BatchPermutation, + IDEEPFallbackOp>); +#endif + REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp); REGISTER_CPU_OPERATOR( BatchPermutationGradient, diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc index b668701b4ce4f4..4fc4d6dcd93a31 100644 --- a/modules/detectron/upsample_nearest_op.cc +++ b/modules/detectron/upsample_nearest_op.cc @@ -15,8 +15,17 @@ */ #include "upsample_nearest_op.h" +#ifdef CAFFE2_USE_IDEEP +#include "caffe2/ideep/operators/operator_fallback_ideep.h" +#include "caffe2/ideep/utils/ideep_operator.h" +#endif namespace caffe2 { +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR( + UpsampleNearest, + IDEEPFallbackOp>); +#endif REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); REGISTER_CPU_OPERATOR( diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h index e24d705bc14afd..17f77855509e67 100644 --- a/modules/detectron/upsample_nearest_op.h +++ b/modules/detectron/upsample_nearest_op.h @@ -35,8 +35,50 @@ class UpsampleNearestOp final : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; + auto translate_idx = [](int ii, int d1, int d2, int d3, int scale_factor) { + int x, y, z, w; + w = ii % d3; + ii = ii/d3; + z = ii % d2; + ii = ii/d2; + y = ii % d1; + ii = ii/d1; + x = ii; + w = w/scale_factor; + z = z/scale_factor; + d2 /= scale_factor; + d3 /= scale_factor; + return (((x*d1+y)*d2)+z)*d3+w; + }; + + auto& X = Input(0); + auto* Y = Output(0); + auto out_shape = X.dims(); + out_shape[X.ndim() - 1] *= scale_; + out_shape[X.ndim() - 2] *= scale_; + Y->Resize(out_shape); + + int d1; + int d2; + int d3; + if (X.ndim() == 3) { + d1 = Y->dim32(0); + d2 = Y->dim32(1); + d3 = Y->dim32(2); + } else { + d1 = Y->dim32(1); + d2 = Y->dim32(2); + d3 = Y->dim32(3); + } + + const T *input_data = X.template data(); + T *output_data = Y->template mutable_data(); + + for (int ii = 0; ii < Y->size(); ii++) { 
+ int ipidx = translate_idx(ii, d1, d2, d3, scale_); + output_data[ii] = input_data[ipidx]; + } + return true; } protected: diff --git a/setup.py b/setup.py index ac82f2a7960a1b..e99283d54ffdd9 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,8 @@ # TORCH_CUDA_ARCH_LIST # specify which CUDA architectures to build for. # ie `TORCH_CUDA_ARCH_LIST="6.0;7.0"` +# These are not CUDA versions, instead, they specify what +# classes of NVIDIA hardware we should generate PTX for. # # ONNX_NAMESPACE # specify a namespace for ONNX built here rather than the hard-coded diff --git a/test/common.py b/test/common.py index 1c86bcd7fe24b8..e7d6940ea56cc1 100644 --- a/test/common.py +++ b/test/common.py @@ -17,6 +17,7 @@ import warnings import random import contextlib +import socket from functools import wraps from itertools import product from copy import deepcopy @@ -111,12 +112,10 @@ def wrapper(*args, **kwargs): def skipIfNoLapack(fn): @wraps(fn) def wrapper(*args, **kwargs): - try: + if not torch._C.has_lapack: + raise unittest.SkipTest('PyTorch compiled without Lapack') + else: fn(*args, **kwargs) - except Exception as e: - if 'Lapack library not found' in repr(e): - raise unittest.SkipTest('Compiled without Lapack') - raise return wrapper @@ -550,3 +549,12 @@ def download_file(url, binary=True): msg = "could not download test file '{}'".format(url) warnings.warn(msg, RuntimeWarning) raise unittest.SkipTest(msg) + + +def find_free_port(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(('localhost', 0)) + sockname = sock.getsockname() + sock.close() + return sockname[1] diff --git a/test/expect/TestJit.test_constant_prop_loop_constant.expect b/test/expect/TestJit.test_constant_prop_loop_constant.expect new file mode 100644 index 00000000000000..5bdca2f2c47890 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_loop_constant.expect @@ -0,0 +1,20 @@ +graph() { + %b.1 : int = prim::Constant[value=0]() + %1 : int = prim::Constant[value=2147483647]() + %2 : int = prim::Constant[value=1]() + %b.3 : int = prim::Loop(%1, %2, %b.1) + block0(%4 : int, %5 : int) { + %b.2 : int = prim::Constant[value=1]() + %7 : int = prim::Constant[value=1]() + -> (%7, %b.2) + } + %8 : int = prim::Constant[value=2147483647]() + %9 : int = prim::Constant[value=0]() + %b : int = prim::Loop(%8, %9, %b.3) + block0(%11 : int, %12 : int) { + %b.4 : int = prim::Constant[value=2]() + %14 : int = prim::Constant[value=0]() + -> (%14, %b.4) + } + return (%b); +} diff --git a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect index 6a9a3a571967a2..078091d52268e2 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_script_fn.expect @@ -1,8 +1,14 @@ graph(%x : Dynamic) { - %1 : Double(4, 3) = prim::Constant[value=]() - %2 : Double(3, 3) = aten::mm(%x, %1) - %3 : int = prim::Constant[value=1]() - %4 : int = prim::Constant[value=1]() - %5 : Dynamic = aten::add(%2, %3, %4) - return (%5); + %1 : int = prim::Constant[value=4]() + %2 : int = prim::Constant[value=3]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : int = prim::Constant[value=7]() + %5 : int = prim::Constant[value=0]() + %6 : int[] = prim::Constant[value=[0, -1]]() + %7 : Double(4, 3) = aten::zeros(%3, %4, %5, %6) + %8 : Double(3, 3) = aten::mm(%x, %7) + %9 : int = prim::Constant[value=1]() + %10 : int = prim::Constant[value=1]() + 
%11 : Dynamic = aten::add(%8, %9, %10) + return (%11); } diff --git a/test/expect/TestScript.test_onnx_export_speculate-f1.expect b/test/expect/TestScript.test_onnx_export_speculate-f1.expect index 47f55eb41ccdaa..4e8e51552ea4ac 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f1.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f1.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10}] - outputs: [{name: "6", type:Tensor dims: 10 1}] + outputs: [{name: "8", type:Tensor dims: 10 1}] initializers: [] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [1], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "Transpose", inputs: [1], outputs: [3], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "Transpose", inputs: [1], outputs: [4], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "ReduceSum", inputs: [1], outputs: [2], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [2,3], outputs: [4], attributes: []}, Node {type: "Transpose", inputs: [1], outputs: [5], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, - Node {type: "If", inputs: [2], outputs: [6], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "Transpose", inputs: [1], outputs: [6], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "Transpose", inputs: [1], outputs: [7], attributes: [{ name: 'perm', type: ints, values: [1 0]}]}, + Node {type: "If", inputs: [4], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "8", type:Tensor dims: }] + outputs: [{name: "9", type:Tensor dims: }] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [7], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [7], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [4], outputs: [9], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "3", type:Tensor dims: }] + outputs: [{name: "5", type:Tensor dims: }] initializers: [] nodes: [ @@ -37,7 +38,7 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "4", type:Tensor dims: }] + outputs: [{name: "6", type:Tensor dims: }] initializers: [] nodes: [ @@ -52,7 +53,7 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "5", type:Tensor dims: }] + outputs: [{name: "7", type:Tensor dims: }] initializers: [] nodes: [ diff --git a/test/expect/TestScript.test_onnx_export_speculate-f2.expect b/test/expect/TestScript.test_onnx_export_speculate-f2.expect index e7d04f54309b05..2820ce5f639ecb 100644 --- a/test/expect/TestScript.test_onnx_export_speculate-f2.expect +++ b/test/expect/TestScript.test_onnx_export_speculate-f2.expect @@ -6,27 +6,28 @@ ModelProto { GraphProto { name: "torch-jit-export" inputs: [{name: "x.1", type:Tensor dims: 1 10},{name: "1", type:Tensor dims: 20 10},{name: "2", type:Tensor dims: 20}] - outputs: [{name: "5", type:Tensor dims: 1 20}] + outputs: [{name: "7", type:Tensor dims: 1 20}] 
initializers: [TensorProto shape: [20 10],TensorProto shape: [20]] nodes: [ Node {type: "Add", inputs: [x.1,x.1], outputs: [3], attributes: []}, - Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [4], outputs: [5], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "ReduceSum", inputs: [3], outputs: [4], attributes: [{ name: 'keepdims', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [5], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Greater", inputs: [4,5], outputs: [6], attributes: []}, + Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export1" inputs: [] - outputs: [{name: "7", type:Tensor dims: 1 20}] + outputs: [{name: "8", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Constant", inputs: [], outputs: [6], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "If", inputs: [6], outputs: [7], attributes: [{ name: 'then_branch', type: graph, value: + Node {type: "If", inputs: [6], outputs: [8], attributes: [{ name: 'then_branch', type: graph, value: GraphProto { name: "torch-jit-export2" inputs: [] - outputs: [{name: "8", type:Tensor dims: 1 20}] + outputs: [{name: "9", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [8], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -34,10 +35,10 @@ ModelProto { GraphProto { name: "torch-jit-export3" inputs: [] - outputs: [{name: "9", type:Tensor dims: 1 20}] + outputs: [{name: "10", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [9], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } @@ -49,10 +50,10 @@ ModelProto { GraphProto { name: "torch-jit-export4" inputs: [] - outputs: [{name: "10", type:Tensor dims: 1 20}] + outputs: [{name: "11", type:Tensor dims: 1 20}] initializers: [] nodes: [ - Node {type: "Gemm", inputs: [3,1,2], outputs: [10], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} + Node {type: "Gemm", inputs: [3,1,2], outputs: [11], attributes: [{ name: 'alpha', type: float, value: 1},{ name: 'beta', type: float, value: 1},{ name: 'transB', type: int, value: 1}]} ] } diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect new file mode 100644 index 00000000000000..3a8e01092f8d0b --- /dev/null +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -0,0 +1,54 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + input: "0" + output: "1" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 2 + type: INTS + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type 
{ + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + dim { + dim_value: 1 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index f476cde7afd935..d8e0b6be0d94a9 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -428,6 +428,10 @@ def test_upsample(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., mode='bilinear'), x) + def test_unsqueeze(self): + x = Variable(torch.randn(3, 4), requires_grad=True) + self.assertONNX(lambda x: x.unsqueeze(len(x.shape)), x) + def test_symbolic_override(self): """Lifted from fast-neural-style: custom implementation of instance norm to be mapped to ONNX operator""" diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 9b31d02d6e385d..349e7fc1eec375 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -798,6 +798,18 @@ def test_convtranspose(self): model = nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, padding=1, output_padding=2) self.run_model_test(model, train=False, batch_size=BATCH_SIZE, atol=1e-7) + def test_unsqueeze(self): + shape = (3, 4, 5) + for dim in range(len(shape) + 1): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.unsqueeze(dim) + x = Variable(torch.randn(*shape)) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + # NB: InstanceNorm model includes unused weights, so skip this in TestCaffe2BackendEmbed # TODO: We should have another pass to eliminate the unused initializers in ONNX models. 
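The new unsqueeze tests above export x.unsqueeze(len(x.shape)), i.e. appending a trailing dimension, which the expect file records as an Unsqueeze node with axes ints: 2 for a (3, 4) input. A quick standalone sanity check of that tensor-level behavior (plain PyTorch, no ONNX export involved):

    import torch

    x = torch.randn(3, 4)
    y = x.unsqueeze(len(x.shape))   # append a trailing dim -> ONNX Unsqueeze axes=[2]
    assert y.shape == (3, 4, 1)
    assert torch.equal(y.squeeze(2), x)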
@skipIfEmbed diff --git a/test/run_test.py b/test/run_test.py index 3979ba0f2d15e6..71b96e78bc91b5 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -14,6 +14,7 @@ import torch from torch.utils import cpp_extension from common import TEST_WITH_ROCM +import torch.distributed.c10d as c10d TESTS = [ 'autograd', @@ -31,12 +32,14 @@ 'nn', 'optim', 'sparse', + 'thd_distributed', 'torch', 'utils', ] WINDOWS_BLACKLIST = [ 'distributed', + 'thd_distributed', ] ROCM_BLACKLIST = [ @@ -46,10 +49,29 @@ 'distributions', 'multiprocessing', 'nccl', + 'thd_distributed', 'utils', ] DISTRIBUTED_TESTS_CONFIG = { + 'gloo': { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + }, +} + + +if c10d.is_available(): + if c10d.is_mpi_available(): + DISTRIBUTED_TESTS_CONFIG['mpi'] = { + 'WORLD_SIZE': '3' + } + if c10d.is_nccl_available(): + DISTRIBUTED_TESTS_CONFIG['nccl'] = { + 'WORLD_SIZE': '2' if torch.cuda.device_count() == 2 else '3' + } + + +THD_DISTRIBUTED_TESTS_CONFIG = { 'tcp': { 'WORLD_SIZE': '3' }, @@ -122,7 +144,10 @@ def test_distributed(python, test_module, test_directory, options): if options.verbose and not mpi_available: print_to_stderr( 'MPI not available -- MPI backend tests will be skipped') - for backend, env_vars in DISTRIBUTED_TESTS_CONFIG.items(): + config = DISTRIBUTED_TESTS_CONFIG + if test_module == "test_thd_distributed": + config = THD_DISTRIBUTED_TESTS_CONFIG + for backend, env_vars in config.items(): if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: @@ -137,7 +162,10 @@ def test_distributed(python, test_module, test_directory, options): os.environ['INIT_METHOD'] = 'env://' os.environ.update(env_vars) if with_init_file: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + if test_module == "test_distributed": + init_method = 'file://{}/'.format(tmp_dir) + else: + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) @@ -166,6 +194,7 @@ def test_distributed(python, test_module, test_directory, options): CUSTOM_HANDLERS = { 'cpp_extensions': test_cpp_extensions, 'distributed': test_distributed, + 'thd_distributed': test_distributed, } diff --git a/test/test_c10d.py b/test/test_c10d.py index c448eba1349972..13f7b779d04736 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -1,7 +1,6 @@ import copy import math import multiprocessing -import socket import sys import tempfile import unittest @@ -10,6 +9,7 @@ from collections import namedtuple import torch +import common from torch import nn import torch.nn.functional as F from torch.distributed import c10d @@ -60,15 +60,6 @@ def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) -def find_free_port(): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(('localhost', 0)) - sockname = sock.getsockname() - sock.close() - return sockname[1] - - def gpus_for_rank(world_size): """Multigpu tests are designed to simulate the multi nodes with multi GPUs on each node. Nccl backend requires equal #GPUs in each process. 
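The gpus_for_rank docstring above describes splitting the visible GPUs evenly across ranks so each process simulates one multi-GPU node. A self-contained sketch of that partitioning, assuming (as the NCCL backend requires) that the world size divides the GPU count evenly; this is an illustration, not the helper from test_c10d.py itself:

    def gpus_for_rank_sketch(n_gpus, world_size):
        # Each rank gets a contiguous, equally sized slice of the device ids.
        per_rank = n_gpus // world_size
        return [list(range(r * per_rank, (r + 1) * per_rank))
                for r in range(world_size)]

    assert gpus_for_rank_sketch(4, 2) == [[0, 1], [2, 3]]
    assert gpus_for_rank_sketch(8, 4) == [[0, 1], [2, 3], [4, 5], [6, 7]]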
@@ -126,14 +117,14 @@ def _create_store(self): class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() return c10d.TCPStore(addr, port, True) class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): addr = 'localhost' - port = find_free_port() + port = common.find_free_port() self.tcpstore = c10d.TCPStore(addr, port, True) self.prefix = "test_prefix" @@ -150,10 +141,10 @@ def test_unknown_handler(self): class RendezvousFileTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'path missing'): - gen = c10d.rendezvous('file://?rank=0&size=1') + gen = c10d.rendezvous('file://?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('file:///tmp/foo?size=1') + gen = c10d.rendezvous('file:///tmp/foo?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('file:///tmp/foo?rank=0') @@ -161,7 +152,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile() as file: - url = 'file://%s?size=%d' % (file.name, 2) + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -183,10 +174,10 @@ def test_nominal(self): class RendezvousTCPTest(TestCase): def test_common_errors(self): with self.assertRaisesRegex(ValueError, 'port number missing'): - gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&size=1') + gen = c10d.rendezvous('tcp://127.0.0.1?rank=0&world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'rank parameter missing'): - gen = c10d.rendezvous('tcp://127.0.0.1:23456?size=1') + gen = c10d.rendezvous('tcp://127.0.0.1:23456?world_size=1') next(gen) with self.assertRaisesRegex(ValueError, 'size parameter missing'): gen = c10d.rendezvous('tcp://127.0.0.1:23456?rank=0') @@ -194,8 +185,8 @@ def test_common_errors(self): def test_nominal(self): addr = 'localhost' - port = find_free_port() - url = 'tcp://%s:%d?size=%d' % (addr, port, 2) + port = common.find_free_port() + url = 'tcp://%s:%d?world_size=%d' % (addr, port, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -245,7 +236,7 @@ def setUpClass(cls): def setUp(self): self.rank = self.MAIN_PROCESS_RANK self.file = tempfile.NamedTemporaryFile() - self.port = find_free_port() + self.port = common.find_free_port() self.processes = [self._spawn_process(rank) for rank in range(int(self.world_size))] def tearDown(self): @@ -529,8 +520,9 @@ def _test_ddp_with_process_group(self, process_group): model = Net() ddp_model = distributed_c10d._DistributedDataParallelC10d( copy.deepcopy(model).cuda(gpus[0]), - process_group, - device_ids=gpus) + device_ids=gpus, + process_group=process_group) + model.cuda(gpus[0]) local_batch_size = len(gpus) diff --git a/test/test_cuda.py b/test/test_cuda.py index 7c70aa2591f3a3..03a2ff5af641fe 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1823,17 +1823,7 @@ def test(use_double=False): @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") def test_symeig(self): - # Small case - tensor = torch.randn(3, 3).cuda() - tensor = torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) - - # Large case - tensor = torch.randn(257, 257).cuda() - tensor = 
torch.mm(tensor, tensor.t()) - eigval, eigvec = torch.symeig(tensor, eigenvectors=True) - self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t())) + TestTorch._test_symeig(self, lambda t: t.cuda()) def test_arange(self): for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']: diff --git a/test/test_distributed.py b/test/test_distributed.py index 47dbe9d056f154..38a32d69ef7c64 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -5,29 +5,32 @@ import os import sys import time +import tempfile import unittest from contextlib import contextmanager from functools import reduce, wraps import torch import torch.cuda -import torch.distributed as dist +import torch.distributed.c10d as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from common import TestCase from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR from torch.autograd import Variable - +import common BACKEND = os.environ["BACKEND"] TEMP_DIR = os.environ["TEMP_DIR"] INIT_METHOD = os.getenv("INIT_METHOD", "env://") -MASTER_PORT = "29500" DEFAULT_TIMEOUT = 300 CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} +if INIT_METHOD.startswith("file://"): + FOLDER = INIT_METHOD[7:] + def get_timeout(test_id): test_name = test_id.split(".")[-1] @@ -361,8 +364,9 @@ def test_broadcast_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_broadcast_group(self): group, group_id, rank = self._init_group_test() self._test_broadcast_helper(group, group_id, rank) @@ -454,7 +458,8 @@ def test_reduce_max(self): self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_sum(self): group, group_id, rank = self._init_group_test() @@ -469,7 +474,8 @@ def test_reduce_group_sum(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_product(self): group, group_id, rank = self._init_group_test() @@ -484,14 +490,16 @@ def test_reduce_group_product(self): ) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU 
tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_reduce_group_max(self): group, group_id, rank = self._init_group_test() @@ -540,8 +548,8 @@ def test_all_reduce_sum(self): ) @unittest.skipIf( - BACKEND != "gloo" and BACKEND != "nccl", - "Only Gloo & Nccl backend support CUDA allReduce", + BACKEND != "gloo", + "Only Gloo backend will have CUDA allReduce tested", ) @skip_if_no_cuda_distributed @skip_if_no_gpu @@ -587,8 +595,9 @@ def test_all_reduce_max(self): group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_sum(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -601,8 +610,9 @@ def test_all_reduce_group_sum(self): 2 + (10 * (len(group) - 1)), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_product(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -615,16 +625,18 @@ def test_all_reduce_group_product(self): reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_min(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 ) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_reduce_group_max(self): group, group_id, rank = self._init_group_test() self._test_all_reduce_helper( @@ -652,6 +664,7 @@ def test_scatter(self): @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_scatter_group(self): group, group_id, rank = self._init_group_test() @@ -679,7 +692,8 @@ def test_gather(self): self._test_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") @skip_if_small_worldsize def test_gather_group(self): group, group_id, rank = self._init_group_test() @@ -703,12 +717,13 @@ def _test_all_gather_helper( self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports CPU all gather") def test_all_gather(self): group, group_id, rank = self._init_global_test() self._test_all_gather_helper(group, group_id, rank) @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @unittest.skipIf(BACKEND == "nccl", "CUDA 
all gather skipped for NCCL") @skip_if_no_cuda_distributed @skip_if_no_gpu def test_all_gather_cuda(self): @@ -716,8 +731,10 @@ def test_all_gather_cuda(self): rank_to_GPU = self._init_multigpu_helper() self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_all_gather_group(self): group, group_id, rank = self._init_group_test() self._test_all_gather_helper(group, group_id, rank) @@ -740,13 +757,14 @@ def _test_barrier_helper(self, group, group_id, rank): self._barrier() - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") def test_barrier(self): group, group_id, rank = self._init_global_test() self._test_barrier_helper(group, group_id, rank) - @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") @skip_if_small_worldsize + @unittest.skipIf(BACKEND != "mpi", "Only MPI supports barrier") + @unittest.skipIf(BACKEND == "mpi", "MPI does not support group") def test_barrier_group(self): group, group_id, rank = self._init_group_test() self._test_barrier_helper(group, group_id, rank) @@ -765,7 +783,8 @@ def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): self.assertEqual(tensor, expected_tensor) self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "NCCL broadcast multigpu skipped") @skip_if_no_gpu def test_broadcast_multigpu(self): group, group_id, rank = self._init_global_test() @@ -802,7 +821,8 @@ def _test_all_reduce_multigpu_helper( self._barrier() - @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @unittest.skipIf(BACKEND == "mpi", "MPI doesn't support broadcast multigpu") + @unittest.skipIf(BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL") @skip_if_no_gpu def test_all_reduce_multigpu(self): group, group_id, rank = self._init_global_test() @@ -985,7 +1005,7 @@ def test_DistributedDataParallel(self): # DDP training setup model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) - model_DDP = nn.parallel.DistributedDataParallel( + model_DDP = nn.parallel._DistributedDataParallelC10d( model_DDP, device_ids=gpu_subset ) @@ -1006,33 +1026,8 @@ def test_DistributedDataParallel(self): ) self._barrier() - @unittest.skipIf( - BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" - ) - def test_DistributedDataParallelCPU(self): - # Run a simple end to end DDP-CPU model, use result of single node - # model as baseline - group, group_id, rank = self._init_global_test() - - # cpu training setup - model_base = self._create_Net() - - # DDP-CPU training setup - model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) - - # dummy data initialization - local_bs = 2 - global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) - # check two model parameters over 2 iterations - self._test_DDP_2iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs - ) - self._barrier() - - -if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": 
+if BACKEND == "gloo" or BACKEND == "nccl": WORLD_SIZE = os.environ["WORLD_SIZE"] class TestDistBackend(TestCase, _DistTestBase): @@ -1052,7 +1047,6 @@ def wrapper(self): @classmethod def setUpClass(cls): os.environ["MASTER_ADDR"] = MASTER_ADDR - os.environ["MASTER_PORT"] = MASTER_PORT os.environ["WORLD_SIZE"] = WORLD_SIZE for attr in dir(cls): if attr.startswith("test"): @@ -1060,6 +1054,17 @@ def setUpClass(cls): setattr(cls, attr, cls.manager_join(fn)) def setUp(self): + # Adding this hack until we fix the FileStore to delete its + # content at the end + global INIT_METHOD + if INIT_METHOD.startswith("file://"): + _, filename = tempfile.mkstemp(prefix=FOLDER) + INIT_METHOD = "file://{}".format(filename) + + if INIT_METHOD.startswith("env://"): + port = common.find_free_port() + os.environ["MASTER_PORT"] = str(port) + self.processes = [] self.rank = self.MANAGER_PROCESS_RANK Barrier.init() @@ -1081,7 +1086,10 @@ def _run(self, rank): self.rank = rank try: dist.init_process_group( - init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + init_method=INIT_METHOD, + backend=BACKEND, + world_size=int(WORLD_SIZE), + rank=self.rank ) except RuntimeError as e: if "recompile" in e.args[0]: diff --git a/test/test_jit.py b/test/test_jit.py index 5a9e9656e567ab..4fe4adc5b1d00d 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -396,6 +396,28 @@ def fn(x, y): self.assertEqual(fn(x, y), fn_traced(x, y)) + def test_disabled(self): + torch.jit._enabled = False + try: + def f(x, y): + return x + y + + self.assertIs(torch.jit.trace(torch.randn(2, 2), torch.randn(2, 2))(f), f) + self.assertIs(torch.jit.script(f), f) + + class MyModule(torch.jit.ScriptModule): + @torch.jit.script_method + def method(self, x): + return x + + # XXX: Unfortunately ScriptModule won't simply become Module now, + # because that requires disabling the JIT at startup time, which + # we can't do in here. 
+ # We need to or those two conditions to make it work with all versions of Python + self.assertTrue(inspect.ismethod(MyModule.method) or inspect.isfunction(MyModule.method)) + finally: + torch.jit._enabled = True + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not @@ -956,6 +978,24 @@ def fn(x, y): self.assertExpectedGraph(traced_fn.graph) self.assertExportImport(traced_fn.graph, (x, y)) + def test_trace_tensor_factory(self): + def run(**kwargs): + inputs_require_grads = kwargs.pop('inputs_require_grads', True) + + def fn(x): + return x + torch.ones(2, 3, **kwargs) + input = torch.ones(2, 3, **kwargs) + self.checkTrace(fn, (input,), inputs_require_grads=inputs_require_grads) + # check we recorded 'ones' and did not just record a constant + tfn = torch.jit.trace(input)(fn) + self.assertTrue("ones" in str(tfn.graph)) + run() + run(dtype=torch.int, inputs_require_grads=False) + if RUN_CUDA: + run(device="cuda:0") + if RUN_CUDA_MULTI_GPU: + run(device="cuda:1") + # TODO: implement @unittest.expectedFailure def test_output_unflatten(self): @@ -1384,8 +1424,6 @@ def constant_prop(a, b): self.run_pass('constant_propagation', constant_prop.graph) self.assertExpected(canonical(constant_prop.graph)) - # TODO: implement - @unittest.expectedFailure def test_constant_prop_loop_constant(self): @torch.jit.script def constant_prop(): @@ -4701,8 +4739,12 @@ def __init__(self, m): @torch.jit.script_method def forward(self, x): x += x - if True: - if True: + # because we are testing if we emit `if` statement correctly + # we cannot use `True` as the condition. Constant prop + # would remove the `if` statements. + c = sum(x) > 4 + if c: + if c: y = self.m(x) else: y = self.m(x) diff --git a/test/test_thd_distributed.py b/test/test_thd_distributed.py new file mode 100644 index 00000000000000..47dbe9d056f154 --- /dev/null +++ b/test/test_thd_distributed.py @@ -0,0 +1,1148 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import copy +import fcntl +import multiprocessing +import os +import sys +import time +import unittest +from contextlib import contextmanager +from functools import reduce, wraps + +import torch +import torch.cuda +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from common import TestCase +from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR +from torch.autograd import Variable + + +BACKEND = os.environ["BACKEND"] +TEMP_DIR = os.environ["TEMP_DIR"] +INIT_METHOD = os.getenv("INIT_METHOD", "env://") +MASTER_PORT = "29500" + +DEFAULT_TIMEOUT = 300 +CUSTOMIZED_TIMEOUT = {"test_DistributedDataParallel": 500} + + +def get_timeout(test_id): + test_name = test_id.split(".")[-1] + if test_name in CUSTOMIZED_TIMEOUT: + return CUSTOMIZED_TIMEOUT[test_name] + else: + return DEFAULT_TIMEOUT + + +if not dist.is_available(): + print("Distributed not available, skipping tests") + sys.exit(0) + +SKIP_IF_NO_CUDA_EXIT_CODE = 75 +SKIP_IF_NO_GPU_EXIT_CODE = 76 +SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE = 77 +SKIP_IF_BACKEND_UNAVAILABLE = 78 + + +def skip_if_no_cuda_distributed(func): + func.skip_if_no_cuda_distributed = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_no_gpu(func): + """ Nccl multigpu tests requires at least 2 GPUS. 
Skip if this is not met""" + func.skip_if_no_gpu = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not torch.cuda.is_available(): + sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE) + if torch.cuda.device_count() < int(os.environ["WORLD_SIZE"]): + sys.exit(SKIP_IF_NO_GPU_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def skip_if_small_worldsize(func): + func.skip_if_small_worldsize = True + + @wraps(func) + def wrapper(*args, **kwargs): + if (os.environ["BACKEND"] != "mpi") and int(os.environ["WORLD_SIZE"]) <= 2: + sys.exit(SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE) + + return func(*args, **kwargs) + + return wrapper + + +def apply_hack_for_nccl(): + # This is a hack for a known NCCL issue using multiprocess + # in conjunction with multiple threads to manage different GPUs which + # may cause ncclCommInitRank to fail. + # http://docs.nvidia.com/deeplearning/sdk/nccl-release-notes/rel_2.1.4.html#rel_2.1.4 + # It slows down the performance of collective operations. + # Without this setting NCCL might throw unhandled error. + os.environ["NCCL_MAX_NRINGS"] = "1" + + +@contextmanager +def _lock(): + lockfile = os.path.join(TEMP_DIR, "lockfile") + with open(lockfile, "w") as lf: + try: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield + finally: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + lf.close() + + +def _build_tensor(size, value=None): + if value is None: + value = size + return torch.FloatTensor(size, size, size).fill_(value) + + +class Barrier(object): + barrier_id = 0 + + @classmethod + def init(cls): + cls.barrier_id = 0 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + for f_name in os.listdir(barrier_dir): + os.unlink(os.path.join(barrier_dir, f_name)) + + @classmethod + def sync(cls, timeout=5): + cls.barrier_id += 1 + barrier_dir = os.path.join(TEMP_DIR, "barrier") + pid = str(os.getpid()) + barrier_file = os.path.join(barrier_dir, pid) + with _lock(): + with open(barrier_file, "w") as f: + f.write(str(cls.barrier_id)) + + start_time = time.time() + while True: + arrived = 0 + with _lock(): + for f_name in os.listdir(barrier_dir): + with open(os.path.join(barrier_dir, f_name), "r") as f: + data = f.read() + if int(data) >= cls.barrier_id: + arrived += 1 + if arrived == dist.get_world_size(): + break + + if time.time() - start_time > timeout: + raise RuntimeError("barrier timeout") + time.sleep(0.1) + + +class _DistTestBase(object): + def _barrier(self, *args, **kwargs): + Barrier.sync(*args, **kwargs) + + def _init_group_test(self): + group = [1, 2] + group_id = dist.new_group(group) + rank = dist.get_rank() + if rank not in group: + return ([], None, rank) + + return (group, group_id, rank) + + def _init_global_test(self): + group = [i for i in range(0, dist.get_world_size())] + group_id = dist.group.WORLD + rank = dist.get_rank() + return (group, group_id, rank) + + # HELPER FOR MULTIGPU TESTS + def _init_multigpu_helper(self): + """Multigpu tests are designed to simulate the multi nodes with multi + GPUs on each node. Nccl backend requires equal #GPUs in each process. + On a single node, all visible GPUs are evenly + divided to subsets, each process only uses a subset. 
+ """ + nGPUs = torch.cuda.device_count() + world_size = dist.get_world_size() + visible_devices = range(nGPUs) + + if BACKEND == "nccl": + apply_hack_for_nccl() + + nGPUs_per_process = nGPUs // world_size + rank_to_GPU = { + i: list( + visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process] + ) + for i in range(world_size) + } + return rank_to_GPU + + # GET RANK + def test_get_rank(self): + test_dir = os.path.join(TEMP_DIR, "test_dir") + pid = str(os.getpid()) + num_processes = dist.get_world_size() + with open(os.path.join(test_dir, pid), "w") as f: + f.write(str(dist.get_rank())) + + self._barrier() + + all_ranks = set() + for f_name in os.listdir(test_dir): + with open(os.path.join(test_dir, f_name), "r") as f: + all_ranks.add(int(f.read())) + self.assertEqual(len(all_ranks), num_processes) + + self._barrier() + + if dist.get_rank() == 0: + for f_name in os.listdir(test_dir): + os.unlink(os.path.join(test_dir, f_name)) + + self._barrier() + + # SEND RECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support send/recv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") + def test_send_recv(self): + rank = dist.get_rank() + tensor = _build_tensor(rank + 1) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(src + 1, value=-1) + expected_tensor = _build_tensor(src + 1) + dist.recv(tensor, src) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + # SEND RECV ANY SOURCE + @unittest.skipIf( + BACKEND == "gloo", "Gloo does not support send/recv from any source" + ) + @unittest.skipIf( + BACKEND == "nccl", "Nccl does not support send/recv from any source" + ) + def test_send_recv_any_source(self): + rank = dist.get_rank() + tensor = _build_tensor(10, rank) + for dest in range(0, dist.get_world_size()): + if dest == rank: + continue + dist.send(tensor, dest) + + recv_ranks = set() + for src in range(0, dist.get_world_size()): + if src == rank: + continue + tensor = _build_tensor(10, value=-1) + sender = dist.recv(tensor) + self.assertTrue(tensor.eq(sender).all()) + recv_ranks.add(sender) + + self.assertEqual(len(recv_ranks), dist.get_world_size() - 1) + self._barrier() + + # ISEND + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support isend") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") + def test_isend(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + requests = [ + dist.isend(_build_tensor(dest, 10), dest) + for dest in range(1, world_size) + ] + for request in requests: + request.wait() + self.assertTrue(request.is_completed()) + else: + tensor = _build_tensor(rank, -1) + dist.recv(tensor, 0) + self.assertEqual(tensor, _build_tensor(rank, 10)) + + self._barrier() + + # IRECV + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support irecv") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support irecv") + def test_irecv(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)] + requests = [ + dist.irecv(expected_tensors[src - 1], src) + for src in range(1, world_size) + ] + + for src in range(1, world_size): + requests[src - 1].wait() + self.assertTrue(requests[src - 1].is_completed()) + self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10)) + else: + tensor = _build_tensor(rank, 10) + 
dist.send(tensor, 0) + + self._barrier() + + # BROADCAST + def _test_broadcast_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for ttype, value, requires_cuda in [ + ("torch.FloatTensor", -1e-10, False), + ("torch.DoubleTensor", -1e-100, False), + ("torch.HalfTensor", -0.1, True), + ("torch.CharTensor", -2, False), + ("torch.ByteTensor", 129, False), + ("torch.IntTensor", -1e5, False), + ("torch.LongTensor", -1e15, False), + ]: + if requires_cuda and not cuda: + continue + for src in group: + expected_tensor = _build_tensor(src + 1, value).type(ttype) + if cuda: + expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0]) + if rank == src: + dist.broadcast(expected_tensor, src, group_id) + else: + tensor = _build_tensor(src + 1, -1).type(ttype) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.broadcast(tensor, src, group_id) + self.assertEqual(tensor.size(), expected_tensor.size()) + self.assertEqual(tensor.ne(expected_tensor).max(), 0) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_broadcast(self): + group, group_id, rank = self._init_global_test() + self._test_broadcast_helper(group, group_id, rank) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo and Nccl backend supports CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_broadcast_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_broadcast_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_broadcast_group(self): + group, group_id, rank = self._init_group_test() + self._test_broadcast_helper(group, group_id, rank) + + # REDUCE + def _test_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.reduce(tensor, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA reduce") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + 10 * (len(group) - 1), + True, + rank_to_GPU, + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] 
* (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support reduce") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_reduce_helper(group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10) + + # ALL REDUCE + def _test_all_reduce_helper( + self, + group, + group_id, + rank, + op, + master_value, + worker_value, + expected_value, + cuda=False, + rank_to_GPU=None, + ): + for src in group: + if rank == src: + tensor = _build_tensor(src + 1).fill_(master_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + else: + tensor = _build_tensor(src + 1).fill_(worker_value) + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + dist.all_reduce(tensor, op, group_id) + self.assertEqual(tensor, _build_tensor(src + 1, expected_value)) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_sum(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf( + BACKEND != "gloo" and BACKEND != "nccl", + "Only Gloo & Nccl backend support CUDA allReduce", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_reduce_sum_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + True, + rank_to_GPU, + ) + + 
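+    # NOTE (illustrative comment, not from the original patch): the reduce and
+    # all_reduce tests above check results against a closed-form expected
+    # value: one rank (the "master") contributes master_value and every other
+    # rank contributes worker_value, so for SUM the expectation is
+    #
+    #   expected = master_value + worker_value * (world_size - 1)
+    #
+    # e.g. 2 + 10 * (len(group) - 1) == 22 for a world size of 3, while for
+    # PRODUCT it is reduce(lambda x, y: x * y,
+    #                      [worker_value] * (world_size - 1), master_value),
+    # i.e. 2 * 10 ** (world_size - 1).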
@unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_product(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_min(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_reduce_max(self): + group, group_id, rank = self._init_global_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_sum(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.SUM, + 2, + 10, + 2 + (10 * (len(group) - 1)), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_product(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, + group_id, + rank, + dist.reduce_op.PRODUCT, + 2, + 10, + reduce((lambda x, y: x * y), [10] * (len(group) - 1), 2), + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_min(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MIN, 1010, 1, 1 + ) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_reduce_group_max(self): + group, group_id, rank = self._init_group_test() + self._test_all_reduce_helper( + group, group_id, rank, dist.reduce_op.MAX, -1, 10, 10 + ) + + # SCATTER + def _test_scatter_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, -1) + expected_tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, i) for i in group] if rank == dest else [] + ) + dist.scatter(tensor, src=dest, scatter_list=tensors, group=group_id) + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + def test_scatter(self): + group, group_id, rank = self._init_global_test() + self._test_scatter_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support scatter") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support scatter") + @skip_if_small_worldsize + def test_scatter_group(self): + group, group_id, rank = self._init_group_test() + self._test_scatter_helper(group, group_id, rank) + + # GATHER + def _test_gather_helper(self, group, group_id, rank): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = ( + [_build_tensor(dest + 1, -1) for i in group] if rank == dest else [] + ) + dist.gather(tensor, dst=dest, gather_list=tensors, group=group_id) + if rank == dest: + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == 
"gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_gather(self): + group, group_id, rank = self._init_global_test() + self._test_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "gloo", "Gloo does not support gather") + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_gather_helper(group, group_id, rank) + + # ALL GATHER + def _test_all_gather_helper( + self, group, group_id, rank, cuda=False, rank_to_GPU=None + ): + for dest in group: + tensor = _build_tensor(dest + 1, rank) + tensors = [_build_tensor(dest + 1, -1) for i in group] + if cuda: + tensor = tensor.cuda(rank_to_GPU[rank][0]) + tensors = [t.cuda(rank_to_GPU[rank][0]) for t in tensors] + dist.all_gather(tensors, tensor, group_id) + + expected_tensors = [_build_tensor(dest + 1, i) for i in group] + for t1, t2 in zip(tensors, expected_tensors): + self.assertEqual(t1, t2) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_all_gather(self): + group, group_id, rank = self._init_global_test() + self._test_all_gather_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA all gather") + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_all_gather_cuda(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_helper(group, group_id, rank, True, rank_to_GPU) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_all_gather_group(self): + group, group_id, rank = self._init_group_test() + self._test_all_gather_helper(group, group_id, rank) + + # BARRIER + def _test_barrier_helper(self, group, group_id, rank): + WAIT_TIME = 0.3 # seconds + + for dest in group: + expected_time = torch.DoubleTensor(1).fill_(0.0) + if dest == rank: + expected_time.fill_(time.time() + WAIT_TIME) + dist.broadcast(expected_time, dest, group_id) + time.sleep(WAIT_TIME + 0.1) # sleep a little bit longer + dist.barrier(group_id) + else: + dist.broadcast(expected_time, dest, group_id) + dist.barrier(group_id) + self.assertGreaterEqual(time.time(), expected_time[0]) + + self._barrier() + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support CPU tensors") + def test_barrier(self): + group, group_id, rank = self._init_global_test() + self._test_barrier_helper(group, group_id, rank) + + @unittest.skipIf(BACKEND == "nccl", "Nccl does not support newGroup") + @skip_if_small_worldsize + def test_barrier_group(self): + group, group_id, rank = self._init_group_test() + self._test_barrier_helper(group, group_id, rank) + + def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for src in group: + expected_tensor = _build_tensor(src + 1) + tensors = [ + _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank] + ] + if rank == src: + tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0]) + + dist.broadcast_multigpu(tensors, src, group_id) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports broadcast multigpu") + @skip_if_no_gpu + def test_broadcast_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + 
self._test_broadcast_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _test_all_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + + dist.all_reduce_multigpu(tensors, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + for tensor in tensors: + self.assertEqual(tensor, expected_tensor) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allreduce multigpu") + @skip_if_no_gpu + def test_all_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_reduce_multigpu_helper( + self, + group, + group_id, + rank, + rank_to_GPU, + op, + master_value, + worker_value, + expected_value, + ): + for src in group: + if rank == src: + tensors = [ + _build_tensor(src + 1, master_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + expected_tensor = _build_tensor(src + 1, expected_value) + self.assertEqual(tensors[0], expected_tensor) + else: + tensors = [ + _build_tensor(src + 1, worker_value).cuda(device=i) + for i in rank_to_GPU[rank] + ] + dist.reduce_multigpu(tensors, src, op, group_id) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports reduce multigpu") + @skip_if_no_gpu + def test_reduce_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_reduce_multigpu_helper( + group, + group_id, + rank, + rank_to_GPU, + dist.reduce_op.SUM, + 2, + 10, + (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]), + ) + + def _test_all_gather_multigpu_helper(self, group, group_id, rank, rank_to_GPU): + for dest in group: + tensors = [ + _build_tensor(dest + 1).cuda(device=i) for i in rank_to_GPU[rank] + ] + + # construct expected output along with + # a place holder to receive all gather results + output_tensors = [] + expected_output = [] + output_per_gpu = ( + [_build_tensor(dest + 1, -1)] * len(rank_to_GPU[0]) * len(group) + ) + expected_per_gpu = ( + [_build_tensor(dest + 1)] * len(rank_to_GPU[0]) * len(group) + ) + for gpu in rank_to_GPU[rank]: + output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu]) + expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu]) + + dist.all_gather_multigpu(output_tensors, tensors, group_id) + self.assertEqual(output_tensors, expected_output) + + self._barrier() + + @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports allgather multigpu") + @skip_if_no_gpu + def test_all_gather_multigpu(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU) + + def _create_Net(self): + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 50, bias=False) + self.fc3 = nn.Linear(50, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = 
self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return F.softmax(x, dim=1) + + return Net() + + def _model_step(self, model): + for param in model.parameters(): + param.data += param.grad + param.grad = None + + def _prepare_dummy_data(self, local_bs): + # global_bs for DDP should be divisible by WORLD_SIZE + global_bs = int(WORLD_SIZE) * local_bs + input_cpu = torch.randn(global_bs, 2) + target = torch.randn(global_bs, 4) + loss = nn.MSELoss() + return global_bs, input_cpu, target, loss + + # END TO END TEST FOR DISTRIBUTEDDATAPARALLEL + def _test_DDP_helper(self, model, input_var, target, loss): + model.train() + output = model(input_var) + l = loss(output, target) + l.backward() + + def _assert_equal_param(self, param_gpu, param_DDP): + self.assertEqual(len(param_gpu), len(param_DDP)) + for p_gpu, p_DDP in zip(param_gpu, param_DDP): + self.assertEqual(p_gpu, p_DDP) + + def _test_DDP_2iter( + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size + ): + for _ in range(2): + # single cpu/gpu training + self._test_DDP_helper(model_base, input, target, loss) + + # DDP training, DDP scatters subsets of input_cpu to nodes/GPUs + self._test_DDP_helper( + model_DDP, + input[rank * local_bs: (rank + 1) * local_bs], + target[rank * local_bs: (rank + 1) * local_bs], + loss, + ) + + # Update weights and run a second iteration to shake out errors + self._model_step(model_base) + self._model_step(model_DDP) + self._assert_equal_param( + list(model_base.parameters()), list(model_DDP.module.parameters()) + ) + + # Shuffle the input so that DDP input is different + input = input[torch.randperm(batch_size)] + + @unittest.skipIf( + BACKEND != "nccl" and BACKEND != "gloo", + "Only Nccl & Gloo backend support DistributedDataParallel", + ) + @skip_if_no_cuda_distributed + @skip_if_no_gpu + def test_DistributedDataParallel(self): + # Run a simple end to end DDP model, use result of single node model + # as baseline + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + + # cpu training setup + model = self._create_Net() + + # single gpu training setup + model_gpu = copy.deepcopy(model) + gpu_subset = list(rank_to_GPU[rank]) + model_gpu.cuda(gpu_subset[0]) + + # DDP training setup + model_DDP = copy.deepcopy(model) + model_DDP.cuda(gpu_subset[0]) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, device_ids=gpu_subset + ) + + # dummy data initialization + local_bs = len(gpu_subset) + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_gpu, + model_DDP, + input_cpu.cuda(gpu_subset[0]), + target.cuda(gpu_subset[0]), + loss, + local_bs, + rank, + global_bs, + ) + self._barrier() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DistributedDataParallelCPU" + ) + def test_DistributedDataParallelCPU(self): + # Run a simple end to end DDP-CPU model, use result of single node + # model as baseline + group, group_id, rank = self._init_global_test() + + # cpu training setup + model_base = self._create_Net() + + # DDP-CPU training setup + model_DDP = copy.deepcopy(model_base) + model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + + # dummy data initialization + local_bs = 2 + global_bs, input_cpu, target, loss = self._prepare_dummy_data(local_bs) + + # check two model parameters over 2 iterations + self._test_DDP_2iter( + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, 
global_bs + ) + self._barrier() + + +if BACKEND == "tcp" or BACKEND == "gloo" or BACKEND == "nccl": + WORLD_SIZE = os.environ["WORLD_SIZE"] + + class TestDistBackend(TestCase, _DistTestBase): + MANAGER_PROCESS_RANK = -1 + + @staticmethod + def manager_join(fn): + @wraps(fn) + def wrapper(self): + if self.rank == self.MANAGER_PROCESS_RANK: + self._join_and_reduce(fn) + else: + fn(self) + + return wrapper + + @classmethod + def setUpClass(cls): + os.environ["MASTER_ADDR"] = MASTER_ADDR + os.environ["MASTER_PORT"] = MASTER_PORT + os.environ["WORLD_SIZE"] = WORLD_SIZE + for attr in dir(cls): + if attr.startswith("test"): + fn = getattr(cls, attr) + setattr(cls, attr, cls.manager_join(fn)) + + def setUp(self): + self.processes = [] + self.rank = self.MANAGER_PROCESS_RANK + Barrier.init() + for rank in range(int(WORLD_SIZE)): + self.processes.append(self._spawn_process(rank)) + + def tearDown(self): + for p in self.processes: + p.terminate() + + def _spawn_process(self, rank): + os.environ["RANK"] = str(rank) + name = "process " + str(rank) + process = multiprocessing.Process(target=self._run, name=name, args=(rank,)) + process.start() + return process + + def _run(self, rank): + self.rank = rank + try: + dist.init_process_group( + init_method=INIT_METHOD, backend=BACKEND, world_size=int(WORLD_SIZE) + ) + except RuntimeError as e: + if "recompile" in e.args[0]: + sys.exit(SKIP_IF_BACKEND_UNAVAILABLE) + # sys.exit(0) + raise + # self.id() == e.g. '__main__.TestDistributed.test_get_rank' + # We're retreiving a corresponding test and executing it. + getattr(self, self.id().split(".")[2])() + sys.exit(0) + + def _join_and_reduce(self, fn): + skip_ok = ( + getattr(fn, "skip_if_no_cuda_distributed", False) or + getattr(fn, "skip_if_no_gpu", False) or + getattr(fn, "skip_if_small_worldsize", False) + ) + self.JOIN_TIMEOUT = get_timeout(self.id()) + for p in self.processes: + p.join(self.JOIN_TIMEOUT) + + first_process = self.processes[0] + for p in self.processes: + self.assertEqual(p.exitcode, first_process.exitcode) + + if first_process.exitcode == SKIP_IF_BACKEND_UNAVAILABLE: + raise unittest.SkipTest("Compiled without the " + BACKEND + " backend") + + if skip_ok: + # do this first so we don't give an error message about + # mismatched exit codes if the first isn't valid + assert ( + first_process.exitcode == 0 or + first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE or + first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE or + first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE + ) + + if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE: + raise unittest.SkipTest("cuda is not available") + if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE: + raise unittest.SkipTest( + "One unique gpu per process is not available" + ) + if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE: + raise unittest.SkipTest("worldsize is too small to run group tests") + + self.assertEqual(first_process.exitcode, 0) + + +elif BACKEND == "mpi": + WORLD_SIZE = os.environ["WORLD_SIZE"] + dist.init_process_group(init_method=INIT_METHOD, backend="mpi") + + class TestMPI(TestCase, _DistTestBase): + pass + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + unittest.main() diff --git a/test/test_torch.py b/test/test_torch.py index ff84dbff1cb054..91f2f702552c77 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -179,8 +179,6 @@ def test_namespace(ns, *skips): 'as_strided_', 
re.compile('^clamp_(min|max)_?$'), 'coalesce', - 'digamma', - 'digamma_', 'index_put', 'is_coalesced', 'is_distributed', @@ -4278,13 +4276,12 @@ def test_eig(self): Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') - @skipIfNoLapack - @skipIfRocm - def test_symeig(self): - xval = torch.rand(100, 3) + @staticmethod + def _test_symeig(self, conv_fn): + xval = conv_fn(torch.rand(100, 3)) cov = torch.mm(xval.t(), xval) - rese = torch.zeros(3) - resv = torch.zeros(3, 3) + rese = conv_fn(torch.zeros(3)) + resv = conv_fn(torch.zeros(3, 3)) # First call to symeig self.assertTrue(resv.is_contiguous(), 'resv is not contiguous') @@ -4298,17 +4295,30 @@ def test_symeig(self): ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv.t()) self.assertEqual(cov, ahat, 1e-8, 'VeV\' wrong') + # test eigenvectors=False + rese2 = conv_fn(torch.zeros(3)) + resv2 = conv_fn(torch.randn(3, 3)) + expected_resv2 = conv_fn(torch.zeros(3, 3)) + torch.symeig(cov.clone(), False, out=(rese2, resv2)) + self.assertEqual(rese, rese2) + self.assertEqual(resv2, expected_resv2) + # test non-contiguous - X = torch.rand(5, 5) + X = conv_fn(torch.rand(5, 5)) X = X.t() * X - e = torch.zeros(4, 2).select(1, 1) - v = torch.zeros(4, 2, 4)[:, 1] + e = conv_fn(torch.zeros(4, 2)).select(1, 1) + v = conv_fn(torch.zeros(4, 2, 4))[:, 1] self.assertFalse(v.is_contiguous(), 'V is contiguous') self.assertFalse(e.is_contiguous(), 'E is contiguous') torch.symeig(X, True, out=(e, v)) Xhat = torch.mm(torch.mm(v, torch.diag(e)), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') + @skipIfNoLapack + @skipIfRocm + def test_symeig(self): + self._test_symeig(self, lambda x: x) + @skipIfNoLapack def test_svd(self): a = torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84), @@ -6094,7 +6104,7 @@ def _test_abs(tensors_dict): _test_abs(self._make_tensors((3, 5, 7), val_range=(0, max_val))) _test_abs(self._make_tensors((2, 2, 5, 8, 2, 3), val_range=(0, max_val))) _test_abs(self._make_tensors((1000, ), val_range=(0, max_val))) - _test_abs(self._make_tensors((30, 30, 30), val_range=(0, max_val))) + _test_abs(self._make_tensors((10, 10, 10), val_range=(0, max_val))) # Checking that the right abs function is called for LongTensor bignumber = 2 ^ 31 + 1 @@ -8474,6 +8484,67 @@ def test_unique(self): self.assertEqual(torch.ByteTensor([7, 42, 128, 133]), byte_unique) self.assertEqual(torch.LongTensor([3, 0, 0, 0, 1, 2]), byte_inverse) + def test_unique_dim(self): + def run_test(dtype=torch.float): + x = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_unique_dim0 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim0 = torch.tensor([0, 0]) + expected_unique_dim1 = torch.tensor([[[0., 1.], + [1., 1.], + [2., 1.]], + [[0., 1.], + [1., 1.], + [2., 1.]]], dtype=dtype) + expected_inverse_dim1 = torch.tensor([1, 0, 2, 0]) + expected_unique_dim2 = torch.tensor([[[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]], + [[1., 1.], + [0., 1.], + [2., 1.], + [0., 1.]]], dtype=dtype) + expected_inverse_dim2 = torch.tensor([0, 1]) + + # dim0 + x_unique = torch.unique(x, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=0) + self.assertEqual(expected_unique_dim0, x_unique) + self.assertEqual(expected_inverse_dim0, x_inverse) + + # dim1 + x_unique = torch.unique(x, dim=1) + self.assertEqual(expected_unique_dim1, 
x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=1) + self.assertEqual(expected_unique_dim1, x_unique) + self.assertEqual(expected_inverse_dim1, x_inverse) + + # dim2 + x_unique = torch.unique(x, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + + x_unique, x_inverse = torch.unique(x, return_inverse=True, dim=2) + self.assertEqual(expected_unique_dim2, x_unique) + self.assertEqual(expected_inverse_dim2, x_inverse) + + run_test(torch.float) + run_test(torch.double) + run_test(torch.long) + run_test(torch.uint8) + @staticmethod def _test_bincount(self, device): # negative input throws diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index c963650933cf25..ac3e8782eb355d 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -5,11 +5,16 @@ import re from .utils import CodeTemplate, write +from .gen_variable_type import format_trace + FUNCTION_TEMPLATE = CodeTemplate("""\ inline at::Tensor ${name}(${formals}) { + ${pre_record_trace} at::Tensor tensor = at::${name}(${actuals}); - return autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + auto result = autograd::make_variable(tensor, /*requires_grad=*/${requires_grad}); + ${post_record_trace} + return result; } """) @@ -53,6 +58,10 @@ def process_function(decl, has_tensor_options): requires_grad = "options.requires_grad()" if has_tensor_options else "false" if decl['name'].endswith('_like') and not has_tensor_options: actuals.append('at::TensorOptions({}, /*discard_runtime_type=*/true)'.format(actuals[0])) + + pre_record_trace, post_record_trace = format_trace(decl) + return FUNCTION_TEMPLATE.substitute( - name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad + name=decl["name"], formals=formals, actuals=actuals, requires_grad=requires_grad, + pre_record_trace=pre_record_trace, post_record_trace=post_record_trace ) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 0fe32115da314e..caa6744bb38542 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -141,7 +141,7 @@ POST_RECORD_TRACE = CodeTemplate("""\ if (jit::tracer::isTracing()) { - jit::tracer::postRecordTrace(node, ArrayRef(${trace_outputs}) ); + jit::tracer::postRecordTrace(node, at::ArrayRef(${trace_outputs}) ); } """) @@ -183,6 +183,41 @@ def should_trace(declaration): return True +def get_trace_outputs(declaration): + if declaration['return_type'] == 'std::vector': + return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) + elif declaration['name'].endswith('_out'): + output_args = [arg['name'] for arg in declaration['arguments'] + if arg.get('output', False)] + return '{' + ', '.join(output_args) + '}' + trace_outs = [r['name'] for r in declaration['returns']] + if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): + return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) + else: + return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) + + +def format_trace(declaration): + local = {} + + add_trace_inputs = [] + for argument in declaration['arguments']: + add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) + local['add_trace_inputs'] = '\n'.join(add_trace_inputs) + + # Record inplace operations as out-of-place operations (e.g., + # not add_ but add) + # TODO: Add a proper concept of side effects to the IR, and + # properly record 
inplace operations. + local['trace_name'] = uninplace_api_name(declaration['api_name']) + if local['trace_name'] in RENAME_TRACE: + local['trace_name'] = RENAME_TRACE[local['trace_name']] + + local['trace_outputs'] = get_trace_outputs(declaration) + + return (PRE_RECORD_TRACE.substitute(local), POST_RECORD_TRACE.substitute(local)) + + def gen_variable_type(out, aten_declarations, template_path): """VariableType.h and VariableType.cpp body @@ -361,42 +396,10 @@ def reference_args(args): res.append(arg['name']) return res - def get_trace_outputs(declaration): - if declaration['return_type'] == 'std::vector': - return 'flatten_tensor_args({})'.format(declaration['returns'][0]['name']) - elif name.endswith('_out'): - output_args = [arg['name'] for arg in arguments - if arg.get('output', False)] - return '{' + ', '.join(output_args) + '}' - trace_outs = [r['name'] for r in declaration['returns']] - if any(ret['dynamic_type'] == 'TensorList' for ret in declaration['returns']): - return CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=trace_outs) - else: - return CodeTemplate("{ ${outs} }").substitute(outs=trace_outs) - def emit_record_trace(env): if not should_trace(declaration): return ('', '') - - local = {} - - add_trace_inputs = [] - for argument in declaration['arguments']: - add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) - local['add_trace_inputs'] = '\n'.join(add_trace_inputs) - - # Record inplace operations as out-of-place operations (e.g., - # not add_ but add) - # TODO: Add a proper concept of side effects to the IR, and - # properly record inplace operations. - local['trace_name'] = uninplace_api_name(declaration['api_name']) - if local['trace_name'] in RENAME_TRACE: - local['trace_name'] = RENAME_TRACE[local['trace_name']] - - local['trace_outputs'] = get_trace_outputs(declaration) - - combined = nested_dict(local, nested_dict(env, declaration)) - return (PRE_RECORD_TRACE.substitute(combined), POST_RECORD_TRACE.substitute(combined)) + return format_trace(declaration) def declare_returned_variables(): if modifies_arguments: diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 89101a24714b72..244606ca7938d7 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -43,7 +43,7 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { VariableType::VariableType(Context* context, Type* baseType) - : Type(context, baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) + : Type(baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false) , baseType(baseType) , id_(context->freshTypeID()) { str = std::string("Variable[") + baseType->toString() + "]"; diff --git a/tools/autograd/templates/variable_factories.h b/tools/autograd/templates/variable_factories.h index bc2fa21385777f..bf74abc9138c65 100644 --- a/tools/autograd/templates/variable_factories.h +++ b/tools/autograd/templates/variable_factories.h @@ -3,7 +3,7 @@ // ${generated_comment} #include - +#include #include #include diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 994a96ad822b41..d1cdb855c9099f 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -281,6 +281,12 @@ function build_caffe2() { # STOP!!! Are you trying to add a C or CXX flag? Add it # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. 
+ + # This is needed by the aten tests built with caffe2 + if [ -f "${INSTALL_DIR}/lib/libnccl.so" ] && [ ! -f "lib/libnccl.so.1" ]; then + cp "${INSTALL_DIR}/lib/libnccl.so.1" "lib/libnccl.so.1" + fi + ${CMAKE_INSTALL} -j"$MAX_JOBS" # Install Python proto files diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index d337143dd8b09e..ff7fce56e91552 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -262,7 +262,8 @@ def declkey(decl): arguments.extend([ # XXX - until we actually have first-class interpreter types for these # concepts, the default values to be encoded in Tensors - + # If you change this, you also need to update [TensorOptions in script] + # in the tracer code. # dtype is specified as an int64_t of at::ScalarType {'name': 'dtype', 'simple_type': 'ScalarType', 'default': 'float', 'kwarg_only': True}, # layout is specified as an int64_t of at::Layout diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 39d14668958c94..0a76a89a20d55a 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -650,6 +650,20 @@ def add_docstr_all(method, docstr): See :func:`torch.diagonal` """) +add_docstr_all('digamma', + r""" +digamma() -> Tensor + +See :func:`torch.digamma` +""") + +add_docstr_all('digamma_', + r""" +digamma_() -> Tensor + +In-place version of :meth:`~Tensor.digamma` +""") + add_docstr_all('dim', r""" dim() -> int diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 6561c7a7c23889..a9db54d3117842 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1168,6 +1168,26 @@ def parse_kwargs(desc): [ 1.0500, 0.7336, -0.3836, -1.1015]]]) """) +add_docstr(torch.digamma, + r""" +digamma(input) -> Tensor + +Computes the logarithmic derivative of the gamma function on `input`. + +.. math:: + \psi(x) = \frac{d}{dx} \ln\left(\Gamma\left(x\right)\right) = \frac{\Gamma'(x)}{\Gamma(x)} + +Args: + input (Tensor): the tensor to compute the digamma function on + +Example:: + + >>> a = torch.tensor([1, 0.5]) + >>> torch.digamma(a) + tensor([-0.5772, -1.9635]) +""") + + add_docstr(torch.dist, r""" dist(input, other, p=2) -> Tensor @@ -4117,7 +4137,7 @@ def parse_kwargs(desc): Constructs a sparse tensors in COO(rdinate) format with non-zero elements at the given :attr:`indices` with the given :attr:`values`. A sparse tensor can be `uncoalesced`, in that case, there are duplicate coordinates in the indices, and the value at that index is the sum of all duplicate value entries: -`torch.spaerse`_. +`torch.sparse`_. Args: indices (array_like): Initial data for the tensor. Can be a list, tuple, @@ -4439,6 +4459,15 @@ def parse_kwargs(desc): upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region out (tuple, optional): the output tuple of (Tensor, Tensor) +Returns: + (Tensor, Tensor): A tuple containing + + - **e** (*Tensor*): Shape :math:`(m)`. Each element is an eigenvalue of ``input``, + The eigenvalues are in ascending order. + - **V** (*Tensor*): Shape :math:`(m \times m)`. + If ``eigenvectors=False``, it's a tensor filled with zeros. + Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``. 
+ Examples:: diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 75e309ac0faf06..c1be47ad494397 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -554,11 +554,11 @@ def build_table(events, sort_by=None, header=None): header_sep = '-' * max_name_length + (' ' + '-' * col_width) * 5 # Have to use a list because nonlocal is Py3 only... - result = [''] + result = [] def append(s): - result[0] += s - result[0] += '\n' + result.append(s) + result.append('\n') # Yes, newline after the end as well # Actual printing if header is not None: @@ -572,4 +572,4 @@ def append(s): append(row_format.format(evt.key, evt.cpu_time_str, evt.cuda_time_str, evt.count, evt.cpu_time_total_str, evt.cuda_time_total_str)) - return result[0] + return ''.join(result) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index af367c3e544905..e17997e6e9baba 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -584,13 +584,20 @@ static PyObject* initModule() { ASSERT_TRUE(THCPStream_init(module)); #endif + auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { + // PyModule_AddObject steals reference + if (incref) { + Py_INCREF(v); + } + return PyModule_AddObject(module, name, v) == 0; + }; + #ifdef USE_CUDNN PyObject *has_cudnn = Py_True; #else PyObject *has_cudnn = Py_False; #endif - Py_INCREF(has_cudnn); - ASSERT_TRUE(PyModule_AddObject(module, "has_cudnn", has_cudnn) == 0); + ASSERT_TRUE(set_module_attr("has_cudnn", has_cudnn)); #ifdef USE_DISTRIBUTED_MW // See comment on CUDA objects @@ -611,19 +618,20 @@ static PyObject* initModule() { // Set ATen warnings to issue Python warnings at::Warning::set_warning_handler(&warning_handler); - ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("has_mkl", at::hasMKL() ? Py_True : Py_False)); + ASSERT_TRUE(set_module_attr("has_lapack", at::hasLAPACK() ? Py_True : Py_False)); #ifdef _GLIBCXX_USE_CXX11_ABI - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", - _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False)); #else - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); + ASSERT_TRUE(set_module_attr("_GLIBCXX_USE_CXX11_ABI", Py_False)); #endif auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); - ASSERT_TRUE(PyModule_AddObject(module, "default_generator", (PyObject*)THPDefaultGenerator) == 0); + // This reference is meant to be given away, so no need to incref here. + ASSERT_TRUE(set_module_attr("default_generator", (PyObject*)THPDefaultGenerator, /* incref= */ false)); #ifdef USE_NUMPY if (_import_array() < 0) return NULL; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index a4fcc6c45e874d..8fd95eda86f121 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -333,16 +333,15 @@ static PyObject * THCPModule_initExtension(PyObject *self) THCPCharStorage_postInit(m); THCPByteStorage_postInit(m); -#ifdef USE_MAGMA - THCMagma_init(state); - bool has_magma = true; -#else - bool has_magma = false; -#endif + bool has_magma = at::hasMAGMA(); + if (has_magma) { + THCMagma_init(state); + } bool has_half = true; auto set_module_attr = [&](const char* name, PyObject* v) { + // PyObject_SetAttrString doesn't steal reference. 
So no need to incref. if (PyObject_SetAttrString(m, name, v) < 0) { throw python_error(); } diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index fdf88bc0704a47..a67d009e024360 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -346,8 +346,8 @@ PyObject* c10d_init(PyObject* _unused) { #endif shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") - .def("isCompleted", &::c10d::ProcessGroup::Work::isCompleted) - .def("isSuccess", &::c10d::ProcessGroup::Work::isSuccess) + .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) + .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) .def("exception", &::c10d::ProcessGroup::Work::exception) .def("synchronize", &::c10d::ProcessGroup::Work::synchronize) .def( diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 42f3f583b848e9..d8f33c533b2039 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -151,9 +151,9 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) nindex += THWStorage_(size)(LIBRARY_STATE self->cdata); - if (nindex < 0 || nindex >= self->cdata->size()) { + if (nindex < 0 || nindex >= self->cdata->numel()) { PyErr_Format(PyExc_IndexError, "index %" PRId64 " out of range for storage of " - "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->size()); + "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->numel()); return NULL; } real value = THWStorage_(get)(LIBRARY_STATE self->cdata, nindex); diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 4a7c01b2ca2e82..6b462160c6d0b2 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -79,7 +79,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) } else { // TODO: retry on collision // TODO: free GIL - but remember to reacquire it when an exception is thrown - THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THManagedMapAllocator::fromDataPtr(storage->data_ptr()); @@ -90,7 +90,7 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) if (!manager_handle) return NULL; THPObjectPtr storage_handle(PyBytes_FromString(ctx->filename())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(3)); @@ -158,7 +158,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) if ((ctx = THMapAllocator::fromDataPtr(storage->data_ptr()))) { // done } else { - THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size())); + THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->numel())); THWStorage_(copy)(new_storage, storage); THWStorage_(swap)(storage, new_storage); ctx = THMapAllocator::fromDataPtr(storage->data_ptr()); @@ -167,7 +167,7 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) THPObjectPtr storage_handle(PyLong_FromLong(ctx->fd())); if (!storage_handle) return NULL; - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); if (!size) return NULL; THPObjectPtr tuple(PyTuple_New(2)); @@ -220,7 +220,7 @@ static PyObject * 
THPStorage_(shareCuda)(THPStorage *self) THPObjectPtr device(PyLong_FromLong(storage->device().index())); THPObjectPtr _handle(Py_None); Py_INCREF(Py_None); - THPObjectPtr size(PyLong_FromLong(storage->size())); + THPObjectPtr size(PyLong_FromLong(storage->numel())); THPObjectPtr _offset(PyLong_FromLong(0)); if (THWStorage_(data)(LIBRARY_STATE storage)) { size_t base_size; diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f51a735acea1b5..d7876411c687a6 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -13,7 +13,9 @@ Value* insertConstant( Node * n = g.create(prim::Constant); if(val.isTensor()) { at::Tensor ref = std::move(val).toTensor(); - JIT_ASSERT(ref.defined()); + if(!ref.defined()) { + throw constant_not_supported_error("undefined tensors cannot become constants"); + } n->output()->inferTypeFrom(ref); // note: before t_ because of std::move(ref) n->t_(attr::value, std::move(ref)); } else if(val.isInt()) { diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index 6855002d4fd9cb..bfd8ec9b9f1764 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -31,6 +31,10 @@ std::unordered_set skip_list = { aten::randn_like, aten::randperm, aten::randperm_out, + prim::Constant, + prim::Undefined, + // TODO (zach): we should consider skipping tensor factories in the cases + // where the constant tensor would be large but cheap to create. }; std::vector runNode(Node* n) { @@ -40,9 +44,14 @@ std::vector runNode(Node* n) { stack.push_back(*(toIValue(input))); } op(stack); - auto var_outputs = fmap(stack, [&](IValue v) { + auto var_outputs = fmap(stack, [&](IValue v) -> IValue { if (v.isTensor()) { - return IValue(autograd::as_variable_ref(v.toTensor()).data()); + auto t = std::move(v).toTensor(); + if(t.defined()) { + return IValue(autograd::as_variable_ref(t).data()); + } else { + return t; + } } else { return v; } @@ -119,11 +128,11 @@ bool removeExtraNodeOutputs(Node *n) { } // anonymous namespace void ConstantPropagation(Node* n, bool recurse) { - bool constant_inputs = (n->inputs().size() > 0) && - std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - bool supported_node = skip_list.count(n->kind()) == 0; + bool constant_inputs = + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = !n->kind().is_onnx() && skip_list.count(n->kind()) == 0; auto run_blocks = [&]() { if (recurse) { for (Block * block : n->blocks()) { @@ -150,7 +159,6 @@ void ConstantPropagation(Node* n, bool recurse) { } void ConstantPropagation(Block* block, bool recurse) { - ConstantPropagation(block->param_node(), recurse); for(auto it = block->nodes().begin(); it != block->nodes().end();) { Node *n = *it; it++; //advance iterator bc the current node may be destroyed diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index d16d4b00f07e91..d685584a4045be 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -443,14 +443,29 @@ void initPythonIRBindings(PyObject * module_) { switch(t->kind()) { case TypeKind::DynamicType: return "DynamicType"; + case TypeKind::TensorType: + return "TensorType"; + case TypeKind::NumberType: + return "NumberType"; + case TypeKind::NoneType: + return "NoneType"; case TypeKind::CompleteTensorType: return 
"CompleteTensorType"; case TypeKind::TupleType: return "TupleType"; - default: - AT_ERROR("unknown type kind"); - return ""; + case TypeKind::ListType: + return "ListType"; + case TypeKind::IntType: + return "IntType"; + case TypeKind::FloatType: + return "FloatType"; + case TypeKind::StringType: + return "StringType"; + case TypeKind::GeneratorType: + return "GeneratorType"; } + // not reachable, but some compilers complain + AT_ERROR("Unknown Type Kind"); }) .def("sizes",[](Type& t) { return t.expect()->sizes(); diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 5bc7bd574cf766..fee8924277d11e 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -48,6 +48,16 @@ void addInputs(Node *n, const char * name, at::TensorList value) { n->addInput(list_node->output()); } +void addInputs(Node* n, const char * name, const at::TensorOptions& options) { + // [TensorOptions in script] - update this when you change how we schematize TensorOptions + detail::genericAddInput(n, static_cast(options.dtype())); + detail::genericAddInput(n, static_cast(options.layout())); + std::vector device = { + static_cast(options.device().type()), + static_cast(options.device().index())}; + detail::genericAddInput(n, std::move(device)); +} + void addInputs(Node *n, const char * name, at::IntList value) { using ArgumentStash = jit::tracer::ArgumentStash; std::vector info = ArgumentStash::hasIntList(name) ? diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 789b3fd2d4591c..b811534ce27401 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -229,16 +229,17 @@ inline void abandon() { // NB: those serve both as an intermediate steps in addInputs below, // as well as the overloads that terminate template recursion -void addInputs(Node *n, const char * name, int64_t value); -void addInputs(Node *n, const char * name, bool value); -void addInputs(Node *n, const char * name, double value); -void addInputs(Node *n, const char * name, const at::Scalar& value); -void addInputs(Node *n, const char * name, const at::Tensor& value); -void addInputs(Node *n, const char * name, at::IntList value); -void addInputs(Node *n, const char * name, at::TensorList value); -void addInputs(Node *n, const char * name, const ArrayRef& value); -void addInputs(Node *n, const char * name, const std::string& value); -void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, int64_t value); +TORCH_API void addInputs(Node *n, const char * name, bool value); +TORCH_API void addInputs(Node *n, const char * name, double value); +TORCH_API void addInputs(Node *n, const char * name, const at::Scalar& value); +TORCH_API void addInputs(Node *n, const char * name, const at::Tensor& value); +TORCH_API void addInputs(Node *n, const char * name, at::IntList value); +TORCH_API void addInputs(Node *n, const char * name, at::TensorList value); +TORCH_API void addInputs(Node *n, const char * name, const ArrayRef& value); +TORCH_API void addInputs(Node *n, const char * name, const std::string& value); +TORCH_API void addInputs(Node *n, const char * name, const at::SparseTensorRef& value); +TORCH_API void addInputs(Node *n, const char * name, const at::TensorOptions& value); template void addInputs(Node *n, const char * name, std::array value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index c7e33fae7e20ac..e5a3e64ac067d8 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -51,6 +51,8 
@@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "None"; } else if(t.kind() == TypeKind::StringType) { out << "string"; + } else if(t.kind() == TypeKind::GeneratorType) { + out << "Generator"; } else { AT_ERROR("unknown type kind"); } diff --git a/torch/distributed/c10d/__init__.py b/torch/distributed/c10d/__init__.py index 3b98424e891479..5356097743aa3c 100644 --- a/torch/distributed/c10d/__init__.py +++ b/torch/distributed/c10d/__init__.py @@ -6,20 +6,8 @@ def is_available(): if is_available() and not torch._C._c10d_init(): - raise RuntimeError("c10d initialization failed") + raise RuntimeError("Failed to initialize PyTorch distributed support") if is_available(): - from .rendezvous import rendezvous, register_rendezvous_handler - from . import BroadcastOptions, AllreduceOptions - - DEFAULT_REDUCE_OPTIONS = AllreduceOptions() - - def broadcast(tensor, src, process_group): - opts = BroadcastOptions() - opts.rootRank = src - opts.rootTensor = 0 - return process_group.broadcast([tensor], opts) - - def all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): - return process_group.allreduce([tensor], opts) + from .distributed_c10d import * diff --git a/torch/distributed/c10d/distributed_c10d.py b/torch/distributed/c10d/distributed_c10d.py new file mode 100644 index 00000000000000..dc341f99427552 --- /dev/null +++ b/torch/distributed/c10d/distributed_c10d.py @@ -0,0 +1,1054 @@ +import torch + +from .rendezvous import rendezvous, register_rendezvous_handler +from . import BroadcastOptions, AllreduceOptions, ReduceOptions, \ + ScatterOptions, GatherOptions +from . import ReduceOp as reduce_op +from . import PrefixStore +from . import ProcessGroupGloo + + +_MPI_AVAILBLE = True +_NCCL_AVAILBLE = True + + +try: + from. import ProcessGroupMPI +except ImportError: + _MPI_AVAILBLE = False + +try: + from. 
import ProcessGroupNCCL +except ImportError: + _NCCL_AVAILBLE = False + + +class DistBackend: + UNDEFINED = -1 + GLOO = 0 + NCCL = 2 + MPI = 3 + + +class group(object): + WORLD = object() + + +class GroupMember(object): + # Alias to group.WORLD for backward compatibility + WORLD = group.WORLD + NON_GROUP_MEMBER = object() + + +# Cached process groups, map from ProcessGroup to (DistBackend, Store) +_pg_map = {} +# Process group's names, map from ProcessGroup to str +_pg_names = {} +# Process group's global rank to local rank mapping +_pg_group_ranks = {} + +# Default process group state +_default_pg = None +_default_pg_init_method = None + +# Process group count for default naming +_group_count = 0 + + +def _rank_not_in_group(group): + """ + Helper that checks if the current process's rank is not in a given group + + """ + return group == GroupMember.NON_GROUP_MEMBER + + +def _get_group_rank(group, rank): + """ + Helper that gets a given group's local rank in the group from a given global + rank + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank = _pg_group_ranks[group][rank] + if group_rank is None: + raise RuntimeError("The global rank is not part of the group") + return group_rank + + +def _get_global_rank(group, group_rank): + """ + Helper that gets a given group's global rank from a given local rank in the + group + + """ + if group is GroupMember.WORLD: + raise RuntimeError("group.WORLD does not have local rank to global " + "rank mapping") + group_rank_map = _pg_group_ranks[group] + for rank, grp_rank in group_rank_map.items(): + if grp_rank == group_rank: + return rank + raise RuntimeError("The group rank is not part of the group") + + +def _check_default_pg(): + """ + Helper that checks if the default ProcessGroup has been initialized, with + assertion + + """ + assert _default_pg is not None, \ + "Default process group is not initialized" + + +def is_mpi_available(): + """ + Checks if MPI is available + + """ + return _MPI_AVAILBLE + + +def is_nccl_available(): + """ + Checks if NCCL is available + + """ + return _NCCL_AVAILBLE + + +def is_initialized(): + """ + Checks if the default process group has been initialized + + """ + return _default_pg is not None + + +def get_default_group(): + """ + Returns the default process group created by init_process_group + + """ + if not is_initialized(): + raise RuntimeError("Default process group has not been initialized, " + "please make sure to call init_process_group.") + return _default_pg + + +def init_process_group(backend, + init_method="env://", + **kwargs): + """ + Initializes the default distributed process group. This will also + initialize the distributed package + + Arguments: + backend (str): Name of the backend to use. Depending on build-time + configuration, valid values include + ``mpi``, ``gloo``, and ``nccl``. + init_method (str, optional): URL specifying how to initialize the + process group. + world_size (int, optional): Number of processes participating in + the job. + rank (int, optional): Rank of the current process. + group_name (str, optional, deprecated): Group name. + + To enable ``backend == mpi``, PyTorch needs to be built from source on + a system that supports MPI. The same applies to NCCL as well.
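+
+    Example (illustrative sketch; it assumes a ``tcp://`` rendezvous handler
+    is registered and that a second process makes the matching call with
+    ``rank=1``)::
+
+        >>> import torch.distributed.c10d as c10d
+        >>> c10d.init_process_group(backend="gloo",
+        ...                         init_method="tcp://127.0.0.1:23456",
+        ...                         world_size=2,
+        ...                         rank=0)
+        >>> c10d.get_rank(), c10d.get_world_size()
+        (0, 2)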
+ + """ + global _pg_map + global _pg_names + global _default_pg + global _default_pg_init_method + + if _default_pg is not None: + raise RuntimeError("trying to initialize the default process group " + "twice!") + + world_size = kwargs.pop('world_size', -1) + group_name = kwargs.pop('group_name', '') + rank = kwargs.pop('rank', -1) + assert len(kwargs) == 0, \ + "got unexpected keyword arguments: %s" % ",".join(kwargs.keys()) + + if backend == "mpi": + if not is_mpi_available(): + raise RuntimeError("Distributed package doesn't have MPI built in") + + _default_pg = ProcessGroupMPI() + _pg_map[_default_pg] = (DistBackend.MPI, None) + else: + # backward compatible API + if init_method != "env://" and world_size != -1 and rank != -1: + url = "{}?rank={}&world_size={}".format(init_method, + rank, + world_size) + store, _, _ = next(rendezvous(url)) + else: + store, rank, world_size = next(rendezvous(init_method)) + + if backend == "gloo": + _default_pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.GLOO, store) + _pg_names[_default_pg] = group_name + elif backend == "nccl": + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + _default_pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[_default_pg] = (DistBackend.NCCL, store) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Invalid distributed backend name: " + backend) + + _default_pg_init_method = init_method + + +def _new_process_group_helper(world_size, rank, group_name=""): + """ + Create a new distributed process group. And the new process group can be + used to perform collective operations. + + """ + global _pg_map + global _group_count + global _pg_names + + if not group_name: + group_name = str(_group_count) + _group_count += 1 + + if group_name in _pg_names.values(): + raise RuntimeError("The specified group name has already been " + "created, please use a different group name") + + default_backend, default_store = _pg_map[_default_pg] + + # Create the prefix store + store = PrefixStore(group_name, default_store) + + if default_backend == DistBackend.GLOO: + pg = ProcessGroupGloo(store, rank, world_size) + _pg_map[pg] = (DistBackend.GLOO, store, group_name) + _pg_names[_default_pg] = group_name + elif default_backend == DistBackend.NCCL: + if not is_nccl_available(): + raise RuntimeError("Distributed package doesn't have NCCL " + "built in") + pg = ProcessGroupNCCL(store, rank, world_size) + _pg_map[pg] = (DistBackend.NCCL, store, group_name) + _pg_names[_default_pg] = group_name + else: + raise RuntimeError("Unsupported distributed backend by group") + return pg + + +def destroy_process_group(group=group.WORLD): + """ + Destroy a given process group, and deinitialize the distributed package + + Arguments: + group (ProcessGroup, optional): The process group to be destroyed, if + group.WORLD is given, all process + groups including the default one will + be destroyed. 
+ """ + if _rank_not_in_group(group): + return + + global _pg_map + global _pg_names + global _pg_group_ranks + global _default_pg + global _default_pg_init_method + + if group == GroupMember.WORLD: + pg = _default_pg + + if _pg_map.get(pg, None) is None: + raise RuntimeError("Invalid process group specified") + + if group == GroupMember.WORLD: + _default_pg = None + _default_pg_init_method = None + _pg_map.clear() + _pg_names.clear() + _pg_group_ranks.clear() + else: + del _pg_map[pg] + del _pg_names[pg] + del _pg_group_ranks[pg] + + +def get_rank(group=group.WORLD): + """ + Returns the rank of currrent process group + + Rank is a unique identifier assigned to each process within a distributed + process group. They are always consecutive integers ranging from 0 to + ``world_size``. + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The rank of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.rank() + + return group.rank() + + +def get_world_size(group=group.WORLD): + """ + Returns the number of processes in the current process group + + Arguments: + group (ProcessGroup, optional): The process group to work on + + Returns: + The world size of the process group + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.size() + + return group.size() + + +def isend(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.send([tensor], dst) + else: + group_dst_rank = _get_group_rank(group, dst) + return group.send([tensor], group_dst_rank) + + +def irecv(tensor, + src, + group=group.WORLD): + """ + Receives a tensor asynchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + + Returns: + A distributed request object. + None, if not part of the group + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + return _default_pg.recv([tensor], src) + else: + group_src_rank = _get_group_rank(group, src) + return group.recv([tensor], group_src_rank) + + +def send(tensor, + dst, + group=group.WORLD): + """ + Sends a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to send. + dst (int): Destination rank. + group (ProcessGroup, optional): The process group to work on + + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + _default_pg.send([tensor], dst).wait() + else: + group_dst_rank = _get_group_rank(group, dst) + group.send([tensor], group_dst_rank).wait() + + +def recv(tensor, + src=None, + group=group.WORLD): + """ + Receives a tensor synchronously. + + Arguments: + tensor (Tensor): Tensor to fill with received data. + src (int, optional): Source rank. Will receive from any + process if unspecified. 
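The point-to-point wrappers above return c10d request objects; a sketch of how they might be used, assuming the two-process default group from the earlier example:

import torch
import torch.distributed.c10d as c10d

if c10d.get_rank() == 0:
    t = torch.arange(4.0)
    req = c10d.isend(t, dst=1)   # returns immediately with a request object
else:
    t = torch.zeros(4)
    req = c10d.irecv(t, src=0)
req.wait()                        # block until the transfer completes
# The blocking variants send()/recv() call wait() internally.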
+ group (ProcessGroup, optional): The process group to work on + + Returns: + Sender rank + -1, if not part of the group + + """ + if _rank_not_in_group(group): + return -1 + + if group == GroupMember.WORLD: + _check_default_pg() + pg = _default_pg + else: + pg = group + + if src is None: + rank_tensor = torch.IntTensor([-1]) + pg.recv_anysource([tensor], rank_tensor).wait() + src_rank = rank_tensor[0].item() + if group == GroupMember.WORLD: + return src_rank + else: + return _get_global_rank(pg, src_rank) + else: + if group == GroupMember.WORLD: + pg.recv([tensor], src).wait() + else: + group_src_rank = _get_group_rank(pg, src) + pg.recv([tensor], group_src_rank).wait() + return src + + +def broadcast_multigpu(tensor_list, + src, + group=group.WORLD, + async_op=False, + src_tensor=0): + """ + Broadcasts the tensor to the whole group with multiple GPU tensors + per node. + + ``tensor`` must have the same number of elements in all the GPUs from + all processes participating in the collective. each tensor in the list must + be on a different GPU + + Only nccl and gloo backend are currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Tensors that participate in the collective + operation. if ``src`` is the rank, then ``src_tensor``th element of + ``tensor_list`` (``tensor_list[src_tensor]``) will be broadcasted + to all other tensors (on different GPUs) in the src process and + all tensors in ``tensor_list`` of other non-src processes. + You also need to make sure that ``len(tensor_list)`` is the same + for all the distributed processes calling this function. + + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + src_tensor (int, optional): Source tensor rank within ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = src_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast(tensor_list, opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast(tensor_list, opts) + if async_op: + return work + else: + work.wait() + + +def broadcast(tensor, + src, + group=group.WORLD, + async_op=False): + """ + Broadcasts the tensor to the whole group. + + ``tensor`` must have the same number of elements in all processes + participating in the collective. + + Arguments: + tensor (Tensor): Data to be sent if ``src`` is the rank of current + process, and tensor to be used to save received data otherwise. + src (int): Source rank. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. 
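Worth noting: when ``src`` is omitted, ``recv`` reports which rank sent the data. A sketch under the same two-process assumption:

import torch
import torch.distributed.c10d as c10d

if c10d.get_rank() == 1:
    c10d.send(torch.ones(3), dst=0)
else:
    buf = torch.zeros(3)
    sender = c10d.recv(buf)               # src=None: accept from any rank
    print("received from rank", sender)   # expected: received from rank 1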
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.broadcast([tensor], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.broadcast([tensor], opts) + if async_op: + return work + else: + work.wait() + + +def all_reduce_multigpu(tensor_list, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. This function reduces a number of tensors on every node, + while each tensor resides on different GPUs. + Therefore, the input tensor in the tensor list needs to be GPU tensors. + Also, each tensor in the tensor list needs to reside on a different GPU. + + After the call, all ``tensor`` in ``tensor_list`` is going to be bitwise + identical in all processes. + + Only nccl and gloo backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor list (List[Tensor]): List of input and output tensors of + the collective. The function operates in-place and requires that + each tensor to be a GPU tensor on different GPUs. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce(tensor_list, opts) + else: + work = group.allreduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def all_reduce(tensor, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. + + After the call ``tensor`` is going to be bitwise identical in all processes. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = AllreduceOptions() + opts.reduceOp = op + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.allreduce([tensor], opts) + else: + work = group.allreduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def reduce_multigpu(tensor_list, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False, + dst_tensor=0): + """ + Reduces the tensor data on multiple GPUs across all machines. 
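A sketch of the single-tensor ``all_reduce`` wrapper above, in both blocking and async form (two-process group assumed):

import torch
import torch.distributed.c10d as c10d

t = torch.ones(2) * (c10d.get_rank() + 1)   # rank 0: [1., 1.], rank 1: [2., 2.]
c10d.all_reduce(t)                           # default op is reduce_op.SUM
# t is now [3., 3.] on every rank

work = c10d.all_reduce(t, op=c10d.reduce_op.SUM, async_op=True)
work.wait()                                  # the async form returns a work handle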
Each tensor + in ``tensor_list`` should reside on a separate GPU + + Only the GPU of ``tensor_list[dst_tensor]`` on the process with rank ``dst`` + is going to receive the final result. + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + tensor_list (List[Tensor]): Input and output GPU tensors of the + collective. The function operates in-place. + You also need to make sure that ``len(tensor_list)`` is the same for + all the distributed processes calling this function. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + dst_tensor (int, optional): Destination tensor rank within + ``tensor_list`` + + Returns: + Async work handle, if async_op is set to True. + None, otherwise + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + opts.rootTensor = dst_tensor + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce(tensor_list, opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce(tensor_list, opts) + + if async_op: + return work + else: + work.wait() + + +def reduce(tensor, + dst, + op=reduce_op.SUM, + group=group.WORLD, + async_op=False): + """ + Reduces the tensor data across all machines. + + Only the process with rank ``dst`` is going to receive the final result. + + Arguments: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + dst (int): Destination rank + op (optional): One of the values from + ``torch.distributed.c10d.reduce_op`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + opts = ReduceOptions() + opts.reduceOp = op + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.reduce([tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.reduce([tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def all_gather_multigpu(output_tensor_lists, + input_tensor_list, + group=group.WORLD, + async_op=False): + """ + Gathers tensors from the whole group in a list. + Each tensor in ``tensor_list`` should reside on a separate GPU + + Only nccl backend is currently supported + tensors should only be GPU tensors + + Arguments: + output_tensor_lists (List[List[Tensor]]): Output lists. It should + contain correctly-sized tensors on each GPU to be used for output of + the collective. + e.g. ``output_tensor_lists[i]`` contains the all_gather + result that resides on the GPU of ``input_tensor_list[i]``. + Note that each element of ``output_tensor_lists[i]`` has the size of + ``world_size * len(input_tensor_list)``, since the function all + gathers the result from every single GPU in the group. 
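And a sketch of the rooted single-tensor ``reduce`` defined above; only the destination rank is guaranteed to hold the reduced value (two-process group assumed):

import torch
import torch.distributed.c10d as c10d

t = torch.ones(2) * (c10d.get_rank() + 1)
c10d.reduce(t, dst=0)     # default op is reduce_op.SUM
# rank 0 now holds [3., 3.]; the contents of t on other ranks are backend-defined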
To interpret
+        each element of ``output_tensor_list[i]``, note that
+        ``input_tensor_list[j]`` of rank k will appear in
+        ``output_tensor_list[i][rank * world_size + j]``
+        Also note that ``len(output_tensor_lists)``, and the size of each
+        element in ``output_tensor_lists`` (each element is a list,
+        therefore ``len(output_tensor_lists[i])``) need to be the same
+        for all the distributed processes calling this function.
+
+        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
+            be broadcast from current process.
+            Note that ``len(input_tensor_list)`` needs to be the same for
+            all the distributed processes calling this function.
+
+        group (ProcessGroup, optional): The process group to work on
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    """
+    if _rank_not_in_group(group):
+        return
+
+    if group == GroupMember.WORLD:
+        _check_default_pg()
+        work = _default_pg.allgather(output_tensor_lists, input_tensor_list)
+    else:
+        work = group.allgather(output_tensor_lists, input_tensor_list)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+def all_gather(tensor_list,
+               tensor,
+               group=group.WORLD,
+               async_op=False):
+    """
+    Gathers tensors from the whole group in a list.
+
+    Arguments:
+        tensor_list (list[Tensor]): Output list. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        tensor (Tensor): Tensor to be broadcast from current process.
+        group (ProcessGroup, optional): The process group to work on
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    """
+    if _rank_not_in_group(group):
+        return
+
+    if group == GroupMember.WORLD:
+        _check_default_pg()
+        work = _default_pg.allgather([tensor_list], [tensor])
+    else:
+        work = group.allgather([tensor_list], [tensor])
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+def gather(tensor,
+           gather_list,
+           dst,
+           group=group.WORLD,
+           async_op=False):
+    """
+    Gathers a list of tensors in a single process.
+
+    Arguments:
+        tensor (Tensor): Input tensor.
+        gather_list (list[Tensor]): List of appropriately-sized tensors to
+            use for received data. Required only in the receiving process.
+        dst (int): Destination rank. Required in all processes except the one
+            that is receiving the data.
+        group (ProcessGroup, optional): The process group to work on
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
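A sketch of the single-tensor ``all_gather`` wrapper above; every rank passes a correctly sized output list (two-process group assumed):

import torch
import torch.distributed.c10d as c10d

world_size = c10d.get_world_size()
gathered = [torch.zeros(2) for _ in range(world_size)]
mine = torch.ones(2) * c10d.get_rank()
c10d.all_gather(gathered, mine)
# on every rank: gathered == [tensor([0., 0.]), tensor([1., 1.])]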
+ None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if dst == my_rank: + if gather_list is None: + raise RuntimeError("gather_list is a required argument in gather " + "destination") + else: + if gather_list: + raise RuntimeError("non-empty gather_list can be given only " + "to gather destination") + + opts = GatherOptions() + opts.rootRank = dst + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.gather([gather_list], [tensor], opts) + else: + group_dst_rank = _get_group_rank(group, dst) + opts.rootRank = group_dst_rank + work = group.gather([gather_list], [tensor], opts) + + if async_op: + return work + else: + work.wait() + + +def scatter(tensor, + scatter_list, + src, + group=group.WORLD, + async_op=False): + """ + Scatters a list of tensors to all processes in a group. + + Each process will receive exactly one tensor and store its data in the + ``tensor`` argument. + + Arguments: + tensor (Tensor): Output tensor. + scatter_list (list[Tensor]): List of tensors to scatter. Required only + in the process that is sending the data. + src (int): Source rank. Required in all processes except the one that + is sending the data. + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + if src == my_rank: + if scatter_list is None: + raise RuntimeError("scatter_list is a required argument in " + "scatter source") + else: + if scatter_list: + raise RuntimeError("non-empty can be given only to scatter " + "source") + + opts = ScatterOptions() + opts.rootRank = src + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.scatter([tensor], [scatter_list], opts) + else: + group_src_rank = _get_group_rank(group, src) + opts.rootRank = group_src_rank + work = group.scatter([tensor], [scatter_list], opts) + + if async_op: + return work + else: + work.wait() + + +def barrier(group=group.WORLD, + async_op=False): + """ + Synchronizes all processes. + + This collective blocks processes until the whole group enters this function, + if async_op is False, or if async work handle is called on wait(). + + Arguments: + group (ProcessGroup, optional): The process group to work on + async_op (bool, optional): Whether this op should be an async op + + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + """ + if _rank_not_in_group(group): + return + + if group == GroupMember.WORLD: + _check_default_pg() + work = _default_pg.barrier() + else: + work = group.barrier() + + if async_op: + return work + else: + work.wait() + + +def new_group(ranks=None): + """ + Creates a new distributed group. + + This function requires that all processes in the main group (i.e. all + processes that are part of the distributed job) enter this function, even + if they are not going to be members of the group. Additionally, groups + should be created in the same order in all processes. + + Arguments: + ranks (list[int]): List of ranks of group members. + + Returns: + A handle of distributed group that can be given to collective calls. 
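A sketch of how ``new_group`` composes with the collectives above (hypothetical four-process job; every rank must execute the call, even ranks outside ``ranks``):

import torch
import torch.distributed.c10d as c10d

subgroup = c10d.new_group(ranks=[0, 1])   # all ranks call this
t = torch.ones(1)
c10d.all_reduce(t, group=subgroup)
# ranks outside [0, 1] receive GroupMember.NON_GROUP_MEMBER and the collective
# returns immediately for them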
+ """ + + _check_default_pg() + + global _pg_group_ranks + + default_backend, _ = _pg_map[_default_pg] + if default_backend == DistBackend.MPI: + raise RuntimeError("Only NCCL and Gloo backend currently support " + "new_group function") + + global_rank = _default_pg.rank() + global_world_size = _default_pg.size() + + # checks the input ranks + if ranks is not None: + group_world_size = len(ranks) + if group_world_size > global_world_size: + raise RuntimeError("the new group's world size should be less or " + "equal to the world size set by " + "init_process_group") + # check ranks' sanity + for rank in ranks: + if rank < 0 or rank >= global_world_size: + raise RuntimeError("The new group's rank should be within the " + "the world_size set by init_process_group") + + if global_rank in ranks: + group_rank = ranks.index(global_rank) + else: + group_rank = None + else: + group_world_size = global_world_size + group_rank = global_rank + + # Release ranks not in the group + if global_rank not in ranks: + return GroupMember.NON_GROUP_MEMBER + + pg = _new_process_group_helper(group_world_size, group_rank) + + # Create the global rank to group rank mapping + _pg_group_ranks[pg] = {} + for rank in range(global_world_size): + if rank in ranks: + _pg_group_ranks[pg][rank] = ranks.index(rank) + else: + _pg_group_ranks[pg][rank] = None + + return pg + + +# TODO: delete these functions and replace DDP with public functions +DEFAULT_REDUCE_OPTIONS = AllreduceOptions() + + +def _broadcast(tensor, src, process_group): + opts = BroadcastOptions() + opts.rootRank = src + opts.rootTensor = 0 + return process_group.broadcast([tensor], opts) + + +def _all_reduce(tensor, process_group, opts=DEFAULT_REDUCE_OPTIONS): + return process_group.allreduce([tensor], opts) diff --git a/torch/distributed/c10d/rendezvous.py b/torch/distributed/c10d/rendezvous.py index 062443f87abfec..30c9f2dfe7dd3b 100644 --- a/torch/distributed/c10d/rendezvous.py +++ b/torch/distributed/c10d/rendezvous.py @@ -3,6 +3,7 @@ except ImportError: from urlparse import urlparse +import os from . 
import FileStore, TCPStore @@ -59,13 +60,13 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) store = FileStore(path) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using file:// method") @@ -81,18 +82,52 @@ def _error(msg): query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) if "rank" not in query: raise _error("rank parameter missing") - if "size" not in query: - raise _error("size parameter missing") + if "world_size" not in query: + raise _error("world size parameter missing") rank = int(query["rank"]) - size = int(query["size"]) + world_size = int(query["world_size"]) start_daemon = rank == 0 store = TCPStore(result.hostname, result.port, start_daemon) - yield (store, rank, size) + yield (store, rank, world_size) # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using tcp:// method") +def _env_rendezvous_handler(url): + def _error(msg): + return ValueError("env:// rendezvous: " + msg) + + if url != "env://": + raise _error("Only `env://` is expected for the env init method") + world_size = os.environ["WORLD_SIZE"] + if world_size is None: + raise _error("world size is missing") + rank = os.environ["RANK"] + if rank is None: + raise _error("rank is missing") + master_addr = os.environ["MASTER_ADDR"] + if master_addr is None: + raise _error("master addr is missing") + master_port = os.environ["MASTER_PORT"] + if master_port is None: + raise _error("master port is missing") + + # Converting before creating the store + rank = int(rank) + world_size = int(world_size) + master_port = int(master_port) + + # Now start the TCP store daemon on the rank 0 + start_daemon = rank == 0 + store = TCPStore(master_addr, master_port, start_daemon) + yield (store, rank, world_size) + + # If this configuration is invalidated, there is nothing we can do about it + raise RuntimeError("Unable to perform rerendezvous using env:// method") + + register_rendezvous_handler("file", _file_rendezvous_handler) register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/functional.py b/torch/functional.py index 055141b7469a20..8c78b6efe9f80f 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -389,7 +389,7 @@ def isnan(tensor): return tensor != tensor -def unique(input, sorted=False, return_inverse=False): +def unique(input, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the input tensor as a 1-D tensor. 
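The new ``env://`` handler reads four environment variables; a launcher-style sketch with placeholder address and port:

import os
import torch.distributed.c10d as c10d

# Normally exported by the process launcher; set inline here for illustration.
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "29500"
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "2"

c10d.init_process_group(backend="gloo")   # init_method defaults to "env://"

One observation on the handler itself: ``os.environ[...]`` raises ``KeyError`` when a variable is missing, so the ``is None`` checks never fire; ``os.environ.get(...)`` would be needed for the intended error messages to surface.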
Arguments: @@ -431,11 +431,19 @@ def unique(input, sorted=False, return_inverse=False): [ 1, 2]]) """ - output, inverse_indices = torch._unique( - input, - sorted=sorted, - return_inverse=return_inverse, - ) + if dim is not None: + output, inverse_indices = torch._unique_dim( + input, + dim, + sorted=sorted, + return_inverse=return_inverse + ) + else: + output, inverse_indices = torch._unique( + input, + sorted=sorted, + return_inverse=return_inverse, + ) if return_inverse: return output, inverse_indices else: diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 30904ac7adff7d..551a17565e1763 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -21,6 +21,25 @@ import collections import re + +def _parse_env(name, default, true_message, false_message): + value = os.environ.get(name) + if value is None: + return default + if value.lower() in {'1', 'true', 'yes'}: + return True + elif value.lower() in {'0', 'false', 'no'}: + return False + if value == '1v': + print(true_message) + return True + elif value == '0v': + print(false_message) + return False + raise ValueError('Unknown setting of {}. Try using 0 or 1.'.format(name)) + + +_enabled = _parse_env('PYTORCH_JIT', True, "> Using PyTorch JIT", "> PyTorch JIT DISABLED") _flatten = torch._C._jit_flatten _unflatten = torch._C._jit_unflatten _jit_script_compile = torch._C._jit_script_compile @@ -431,6 +450,8 @@ def trace(*args, **kwargs): ... return x * 2 """ def wrapper(func): + if not _enabled: + return func executor_options = {'optimize': True} for name in executor_options: executor_options[name] = kwargs.pop(name, executor_options[name]) @@ -509,6 +530,8 @@ def __getattr__(self, attr): def script(fn, optimize=True, _frames_up=0): + if not _enabled: + return fn rcb = createResolutionCallback(_frames_up + 1) ast = get_jit_ast(fn, is_method=False) graph = _jit_script_compile(ast, rcb) @@ -528,6 +551,8 @@ def script(fn, optimize=True, _frames_up=0): def script_method(fn): + if not _enabled: + return fn # NOTE: we need to traverse two frames here because the meta-class frame # for ScriptModule will be present, as opposed to invoking @script on a # a function or invoking define() on a CompilationUnit. 
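The new ``dim`` argument routes to ``torch._unique_dim``, which is implemented elsewhere in this change; a usage sketch, assuming that kernel deduplicates slices along ``dim``:

import torch

x = torch.tensor([[1, 3],
                  [2, 3],
                  [1, 3]])
print(torch.unique(x, sorted=True, dim=0))
# expected: tensor([[1, 3],
#                   [2, 3]])   (duplicate rows removed)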
@@ -547,6 +572,8 @@ def script_method(fn): def batch(batch_size=1, optimize=True, _frames_up=0): def decorator(fn): + if not _enabled: + return fn import torch.jit.batchop mod = script(fn, optimize, _frames_up) res_graph = torch.to_batch_graph(mod.graph) @@ -757,57 +784,60 @@ def init_then_register(self, *args, **kwargs): return super(ScriptMeta, cls).__init__(name, bases, attrs) -class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): - def __init__(self, optimize=True): - # must be before Module.init since the field is used in __getattr__ - Module.__init__(self) - self._set_optimized(optimize) - self._parameters = OrderedParameterDict(self) - self._buffers = OrderedBufferDict(self) - self._modules = OrderedModuleDict(self) - - def __getattr__(self, attr): - if self._has_method(attr): - if attr in self.__class__._original_methods: - original_method = self.__class__._original_methods[attr] - script_method = self._get_method(attr) - return functools.wraps(original_method)(script_method) +if _enabled: + class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): + def __init__(self, optimize=True): + # must be before Module.init since the field is used in __getattr__ + Module.__init__(self) + self._set_optimized(optimize) + self._parameters = OrderedParameterDict(self) + self._buffers = OrderedBufferDict(self) + self._modules = OrderedModuleDict(self) + + def __getattr__(self, attr): + if self._has_method(attr): + if attr in self.__class__._original_methods: + original_method = self.__class__._original_methods[attr] + script_method = self._get_method(attr) + return functools.wraps(original_method)(script_method) + else: + return self._get_method(attr) + if attr == 'graph' and self._has_method('forward'): + return self.__getattr__('forward').graph + return Module.__getattr__(self, attr) + + def __setattr__(self, attr, value): + if attr not in self._constants_set: + return super(ScriptModule, self).__setattr__(attr, value) + if hasattr(self, attr): + raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) + if isinstance(value, ModuleList): + # special case for list of modules. Modules need to be registered with their + # parent module. To do this, we create a ConstModuleList, which is itself a module, that + # contains each of these modules as submodules. The ConstModuleList then + # is set as an attribute of the parent module. + super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) + elif isinstance(value, Sequential): + super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) else: - return self._get_method(attr) - if attr == 'graph' and self._has_method('forward'): - return self.__getattr__('forward').graph - return Module.__getattr__(self, attr) - - def __setattr__(self, attr, value): - if attr not in self._constants_set: - return super(ScriptModule, self).__setattr__(attr, value) - if hasattr(self, attr): - raise RuntimeError("attempting to re-assign constant '{}'".format(attr)) - if isinstance(value, ModuleList): - # special case for list of modules. Modules need to be registered with their - # parent module. To do this, we create a ConstModuleList, which is itself a module, that - # contains each of these modules as submodules. The ConstModuleList then - # is set as an attribute of the parent module. 
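The effect of the new ``PYTORCH_JIT`` switch as wired up above: with the flag off, ``torch.jit.script``/``trace`` become identity decorators and ``ScriptModule`` falls back to ``torch.nn.Module``. A sketch with a hypothetical script:

# Run as: PYTORCH_JIT=0 python train.py   (disables compilation, e.g. for debugging)
import torch

@torch.jit.script
def scale(x):
    return x * 2

# With PYTORCH_JIT=0 this is the plain Python function; with the default setting it
# is the compiled version. The call behaves the same either way.
print(scale(torch.ones(3)))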
- super(ScriptModule, self).__setattr__(attr, _ConstModuleList(value)) - elif isinstance(value, Sequential): - super(ScriptModule, self).__setattr__(attr, _ConstSequential(value)) - else: - super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) - - def __dir__(self): - return sorted(Module.__dir__(self) + self._method_names()) - - def define(self, lang): - # We use frames_up=1 to get to the proper surrounding scope. The stack - # will look like: - # 0. createResolutionCallback - # 1. define() - # 2. surrounding scope. - # - # createResolutionCallback internally adds 1 to get us to our frame, then - # we add 1 to get to the proper surrounding scope. - rcb = createResolutionCallback(frames_up=1) - self._define(lang, rcb, True) + super(ScriptModule, self).__setattr__(attr, _get_valid_constant(value)) + + def __dir__(self): + return sorted(Module.__dir__(self) + self._method_names()) + + def define(self, lang): + # We use frames_up=1 to get to the proper surrounding scope. The stack + # will look like: + # 0. createResolutionCallback + # 1. define() + # 2. surrounding scope. + # + # createResolutionCallback internally adds 1 to get us to our frame, then + # we add 1 to get to the proper surrounding scope. + rcb = createResolutionCallback(frames_up=1) + self._define(lang, rcb, True) +else: + ScriptModule = torch.nn.Module def _get_methods(cls): @@ -966,12 +996,12 @@ def register_all(mod): return _builtin_table -def _register_builtin(callable, op): - _get_builtin_table()[id(callable)] = op +def _register_builtin(fn, op): + _get_builtin_table()[id(fn)] = op -def _find_builtin(callable): - return _get_builtin_table().get(id(callable)) +def _find_builtin(fn): + return _get_builtin_table().get(id(fn)) if not torch._C._jit_init(): diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 3d2bad9191a1fb..3afa33c7536bac 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -386,16 +386,17 @@ std::shared_ptr ProcessGroupMPI::gather( const GatherOptions& opts) { checkSingleTensor(inputTensors); + if (outputTensors.size() != 1) { + throw std::runtime_error("Gather: multi-GPU collective is not supported"); + } + if (rank_ != opts.rootRank) { - if (outputTensors.size() > 0) { + if (outputTensors[0].size() > 0) { throw std::runtime_error( "Gather: number of output tensors should be 0 " "for non-root"); } } else { - if (outputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != outputTensors[0].size()) { throw std::runtime_error( "Gather: number of output tensors should equal " @@ -449,17 +450,17 @@ std::shared_ptr ProcessGroupMPI::scatter( std::vector>& inputTensors, const ScatterOptions& opts) { checkSingleTensor(outputTensors); + if (inputTensors.size() != 1) { + throw std::runtime_error("Scatter: multi-GPU collective is not supported"); + } if (rank_ != opts.rootRank) { - if (inputTensors.size() > 0) { + if (inputTensors[0].size() > 0) { throw std::runtime_error( "Scatter: number of input tensors should be 0 " "for non-root"); } } else { - if (inputTensors.size() != 1) { - throw std::runtime_error("Gather: multi-GPU collective is not supported"); - } if (static_cast(size_) != inputTensors[0].size()) { throw std::runtime_error( "Scatter: number of input tensors should equal " diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 01ce9bf2ac7cab..f2683756ce59f8 100644 --- a/torch/nn/modules/__init__.py +++ 
b/torch/nn/modules/__init__.py @@ -43,10 +43,10 @@ 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', - 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', - 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', - 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', + 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', + 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', + 'PairwiseDistance', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', + 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d', 'TripletMarginLoss', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', 'AdaptiveLogSoftmaxWithLoss', ] diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 1310d2d748c89c..daa03f9f585114 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -91,13 +91,14 @@ class _DistributedDataParallelC10d(Module): Args: module: module to be parallelized - process_group: the c10d process group to be used for distributed data - all-reduction device_ids: CUDA devices (default: all devices) output_device: device location of output (default: device_ids[0]) broadcast_buffers: flag that enables syncing (broadcasting) buffers of the module at beginning of the forward function. (default: True) + process_group: the c10d process group to be used for distributed data + all-reduction. If None, the default process group will + be used bucket_cap_mb: DistributedDataParallelC10d will bucket parameters into multiple buckets so that gradient reduction of each bucket can potentially overlap with backward computation. 
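With ``process_group`` now optional, construction can rely on the default group; a sketch (assumes ``init_process_group`` has already been called in this process, and the module and device ids are placeholders):

import torch
from torch.nn.parallel.distributed_c10d import _DistributedDataParallelC10d

model = torch.nn.Linear(10, 10).cuda(0)
ddp = _DistributedDataParallelC10d(model, device_ids=[0])
# process_group=None, so the wrapper falls back to c10d.get_default_group()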
@@ -112,9 +113,9 @@ class _DistributedDataParallelC10d(Module): >>> pg = torch.distributed.c10d.ProcessGroupGloo(store, rank, world_size) >>> net = torch.nn._DistributedDataParallelC10d(model, pg) """ - def __init__(self, module, process_group, device_ids=None, + def __init__(self, module, device_ids=None, output_device=None, dim=0, broadcast_buffers=True, - bucket_cap_mb=25): + process_group=None, bucket_cap_mb=25): super(_DistributedDataParallelC10d, self).__init__() @@ -125,13 +126,19 @@ def __init__(self, module, process_group, device_ids=None, if output_device is None: output_device = device_ids[0] + if process_group is None: + self.process_group = c10d.get_default_group() + else: + self.process_group = process_group + self.dim = dim self.module = module - self.process_group = process_group self.device_ids = device_ids self.output_device = output_device self.broadcast_buffers = broadcast_buffers + self.allreduce_opts = c10d.AllreduceOptions() + MB = 1024 * 1024 # used for intra-node param sync and inter-node sync as well @@ -341,7 +348,8 @@ def _queue_reduction(self, bucket_idx): nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams) # now work on the first gpu - reduction_work = c10d.all_reduce(grads_batch_coalesced[0], self.process_group) + reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]], + self.allreduce_opts) self.reduction_works[bucket_idx] = reduction_work self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0] diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 34c30aea654ed7..b65ea160b5c213 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -19,6 +19,7 @@ from torch.autograd import Function, function from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes +from torch._C import ListType @contextlib.contextmanager @@ -103,24 +104,32 @@ def export(model, args, f, export_params=True, verbose=False, training=False, operator_export_type=operator_export_type) -def _list_constant_prop(g, block): +# ONNX can't handle constants that are lists of tensors, which can +# get generated in constant prop. 
So we split them back into prim::ListConstructs +def _split_tensor_list_constants(g, block): for node in block.nodes(): for subblock in node.blocks(): - _list_constant_prop(g, subblock) - if node.kind() == "prim::ListConstruct": - input_nodes = [i.node() for i in node.inputs()] - if all(inode.kind() == "prim::Constant" and inode.kindOf("value") == "i" for inode in input_nodes): - input_values = [inode['value'] for inode in input_nodes] - const_node = g.create("prim::Constant") - const_node.insertBefore(node) - const_node.is_("value", input_values) - const_node.output().setType(torch._C.ListType.ofInts()) - node.output().replaceAllUsesWith(const_node.output()) + _split_tensor_list_constants(g, subblock) + if node.kind() == "prim::Constant": + output_type = node.output().type() + if output_type.isSubtypeOf(ListType.ofTensors()): + inputs = [g.create("prim::Constant").t_('value', t) + .insertBefore(node).output() + for t in node['value']] + lc = (g.create("prim::ListConstruct", inputs) + .insertBefore(node) + .output() + .setType(ListType.ofTensors())) + node.output().replaceAllUsesWith(lc) def _optimize_graph(graph, operator_export_type): - _list_constant_prop(graph, graph) - + # we record now record some ops like ones/zeros + # into a trace where we previously recorded constants + # use constant prop to maintain our current level of onnx support + # without implementing symbolics for all of them + torch._C._jit_pass_constant_propagation(graph) + _split_tensor_list_constants(graph, graph) # run dce to eliminate dead parts of the graph that might have been # left behind by things like symbolic_override torch._C._jit_pass_dce(graph) diff --git a/torch/tensor.py b/torch/tensor.py index ed2f7f0c10a565..904d3a5eeb3760 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -319,13 +319,22 @@ def masked_fill(self, mask, value): """ return self.clone().masked_fill_(mask, value) - def unique(self, sorted=False, return_inverse=False): + def unique(self, sorted=False, return_inverse=False, dim=None): r"""Returns the unique scalar elements of the tensor as a 1-D tensor. See :func:`torch.unique` """ - output, inverse_indices = self._unique( - sorted=sorted, return_inverse=return_inverse) + if dim is not None: + output, inverse_indices = self._unique_dim( + sorted=sorted, + return_inverse=return_inverse, + dim=dim + ) + else: + output, inverse_indices = self._unique( + sorted=sorted, + return_inverse=return_inverse + ) if return_inverse: return output, inverse_indices else:
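Finally, the method form mirrors the functional one; a sketch of ``Tensor.unique`` with the new ``dim`` argument, assuming the underlying ``_unique_dim`` binding accepts the keyword arguments as passed here:

import torch

x = torch.tensor([[0, 1], [0, 1], [2, 3]])
rows, inverse = x.unique(sorted=True, return_inverse=True, dim=0)
# expected: rows    -> tensor([[0, 1], [2, 3]])
#           inverse -> tensor([0, 0, 1])   (index of each original row in rows)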