diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 00000000000000..712143336a1af7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,49 @@
+---
+name: "\U0001F41B Bug Report"
+about: Submit a bug report to help us improve PyTorch
+
+---
+
+## 🐛 Bug
+
+
+
+## To Reproduce
+
+Steps to reproduce the behavior:
+
+1.
+1.
+1.
+
+
+
+## Expected behavior
+
+
+
+## Environment
+
+Please copy and paste the output from our
+[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py)
+(or fill out the checklist below manually).
+
+You can get the script and run it with:
+```
+wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
+# For security purposes, please check the contents of collect_env.py before running it.
+python collect_env.py
+```
+
+ - PyTorch Version (e.g., 1.0):
+ - OS (e.g., Linux):
+ - How you installed PyTorch (`conda`, `pip`, source):
+ - Build command you used (if compiling from source):
+ - Python version:
+ - CUDA/cuDNN version:
+ - GPU models and configuration:
+ - Any other relevant information:
+
+## Additional context
+
+
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
new file mode 100644
index 00000000000000..a699c2e4548f8a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -0,0 +1,9 @@
+---
+name: "\U0001F4DA Documentation"
+about: Report an issue related to https://pytorch.org/docs
+
+---
+
+## 📚 Documentation
+
+
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 00000000000000..e1d2bc306eae8c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,24 @@
+---
+name: "\U0001F680Feature Request"
+about: Submit a proposal/request for a new PyTorch feature
+
+---
+
+## 🚀 Feature
+
+
+## Motivation
+
+
+
+## Pitch
+
+
+
+## Alternatives
+
+
+
+## Additional context
+
+
diff --git a/.github/ISSUE_TEMPLATE/questions-help-support.md b/.github/ISSUE_TEMPLATE/questions-help-support.md
new file mode 100644
index 00000000000000..77bfb55b9a468a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/questions-help-support.md
@@ -0,0 +1,13 @@
+---
+name: "❓Questions/Help/Support"
+about: Do you need support? We have resources.
+
+---
+
+## ❓ Questions and Help
+
+### Please note that this issue tracker is not a help form and this issue will be closed.
+
+We have a set of [listed resources available on the website](https://pytorch.org/resources). Our primary means of support is our discussion forum:
+
+- [Discussion Forum](https://discuss.pytorch.org/)
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 2dc64157c5d00d..e076b329b28f5b 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -102,17 +102,6 @@ fi
 # Add the test binaries so that they won't be git clean'ed away
 git add -f build/bin
 
-# Test C FFI plugins
-# cffi install doesn't work for Python 3.7
-if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then
-  # TODO: Don't run this here
-  pip install cffi
-  git clone https://github.com/pytorch/extension-ffi.git
-  pushd extension-ffi/script
-  python build.py
-  popd
-fi
-
 # Test documentation build
 if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then
   pushd docs
diff --git a/.jenkins/pytorch/enabled-configs.txt b/.jenkins/pytorch/enabled-configs.txt
index da9f62db38ded2..cffb72aa7acc4f 100644
--- a/.jenkins/pytorch/enabled-configs.txt
+++ b/.jenkins/pytorch/enabled-configs.txt
@@ -40,8 +40,8 @@ pytorch-macos-10.13-cuda9.2-cudnn7-py3-build
 pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
-py2-clang3.8-rocm1.7.1-ubuntu16.04-build
-py2-clang3.8-rocm1.7.1-ubuntu16.04-test
+py2-clang7-rocmdeb-ubuntu16.04-build
+py2-clang7-rocmdeb-ubuntu16.04-test
 pytorch-ppc64le-cuda9.2-cudnn7-py3-build
 pytorch-ppc64le-cuda9.2-cudnn7-py3-test
 pytorch-ppc64le-cuda9.1-cudnn7-py3-build
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 471fd8fac1fc6e..c43e821d98daf5 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -102,6 +102,7 @@ test_aten() {
     SUDO=sudo
   fi
 
+  ${SUDO} ln -s "$TORCH_LIB_PATH"/libc10* build/bin
   ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin
   ${SUDO} ln -s "$TORCH_LIB_PATH"/libnccl* build/bin
 
diff --git a/aten/src/ATen/Registry.h b/aten/src/ATen/Registry.h
deleted file mode 100644
index 9d8d8ff2ee8404..00000000000000
--- a/aten/src/ATen/Registry.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#pragma once
-#include <ATen/core/Registry.h>
diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h
index 75ff2a2fe6937f..f964649e19f172 100644
--- a/aten/src/ATen/core/Half-inl.h
+++ b/aten/src/ATen/core/Half-inl.h
@@ -190,6 +190,33 @@ inline AT_HOST_DEVICE Half operator/(int a, Half b) {
   return static_cast<Half>(a) / b;
 }
 
+//// Arithmetic with longs
+inline AT_HOST_DEVICE Half operator+(Half a, long b) {
+  return a + static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator-(Half a, long b) {
+  return a - static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator*(Half a, long b) {
+  return a * static_cast<Half>(b);
+}
+inline AT_HOST_DEVICE Half operator/(Half a, long b) {
+  return a / static_cast<Half>(b);
+}
+
+inline AT_HOST_DEVICE Half operator+(long a, Half b) {
+  return static_cast<Half>(a) + b;
+}
+inline AT_HOST_DEVICE Half operator-(long a, Half b) {
+  return static_cast<Half>(a) - b;
+}
+inline AT_HOST_DEVICE Half operator*(long a, Half b) {
+  return static_cast<Half>(a) * b;
+}
+inline AT_HOST_DEVICE Half operator/(long a, Half b) {
+  return static_cast<Half>(a) / b;
+}
+
 /// NOTE: we do not define comparisons directly and instead rely on the implicit
 /// conversion from at::Half to float.
 
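The Half-inl.h hunk above adds `long` overloads for `+`, `-`, `*`, and `/` on `at::Half`, alongside the existing float, double, and int overloads; each one converts the `long` operand to `Half` before computing. The snippet below is a minimal sketch of what these overloads enable and is not part of the patch; it assumes a host-side program that can include `<ATen/ATen.h>` and link against the ATen library.

```cpp
// Sketch only (not part of the patch): exercising the new Half/long
// operator overloads from aten/src/ATen/core/Half-inl.h.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Half h = 1.5f;  // at::Half converts implicitly from/to float
  long n = 2;

  // Each mixed expression converts the long operand to Half first,
  // mirroring the static_cast<Half>(...) in the new overloads.
  at::Half sum  = h + n;  // 3.5
  at::Half diff = h - n;  // -0.5
  at::Half prod = h * n;  // 3.0
  at::Half quot = h / n;  // 0.75

  std::cout << static_cast<float>(sum) << " " << static_cast<float>(diff) << " "
            << static_cast<float>(prod) << " " << static_cast<float>(quot)
            << std::endl;
  return 0;
}
```

Both argument orders (`Half op long` and `long op Half`) are overloaded in the patch, so expressions such as `n * h` resolve directly rather than relying on implicit conversions.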
diff --git a/aten/src/ATen/core/LegacyTypeDispatch.cpp b/aten/src/ATen/core/LegacyTypeDispatch.cpp index 6835399bfe2ca8..56c19cda3f4271 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.cpp +++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp @@ -9,7 +9,10 @@ LegacyTypeDispatch & globalLegacyTypeDispatch() { return singleton; } -AT_DEFINE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs) +C10_DEFINE_REGISTRY( + LegacyTypeInitRegistry, + LegacyTypeInitInterface, + LegacyTypeInitArgs) const LegacyTypeInitInterface& getLegacyTypeInit() { static std::unique_ptr legacy_type_init; diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 53cedf04e4601a..5383acbb97ebf7 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -43,8 +43,12 @@ struct CAFFE2_API LegacyTypeInitInterface { } }; struct CAFFE2_API LegacyTypeInitArgs {}; -AT_DECLARE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs); -#define REGISTER_LEGACY_TYPE_INIT(clsname) AT_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) +C10_DECLARE_REGISTRY( + LegacyTypeInitRegistry, + LegacyTypeInitInterface, + LegacyTypeInitArgs); +#define REGISTER_LEGACY_TYPE_INIT(clsname) \ + C10_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) CAFFE2_API const LegacyTypeInitInterface& getLegacyTypeInit(); diff --git a/aten/src/ATen/core/Registry.h b/aten/src/ATen/core/Registry.h deleted file mode 100644 index 98a3e4a18c7258..00000000000000 --- a/aten/src/ATen/core/Registry.h +++ /dev/null @@ -1,217 +0,0 @@ -#pragma once - -/** - * Simple registry implementation that uses static variables to - * register object creators during program initialization time. - */ - -// NB: This Registry works poorly when you have other namespaces. -// Make all macro invocations from inside the at namespace. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace at { - -template -inline void PrintOffendingKey(const KeyType& /*key*/) { - printf("[key type printing not supported]\n"); -} - -template <> -inline void PrintOffendingKey(const std::string& key) { - printf("Offending key: %s.\n", key.c_str()); -} - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a std::string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. - */ -template -class CAFFE2_API Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. 
- std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const std::string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a std::vector. - */ - std::vector Keys() { - std::vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const std::unordered_map& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - std::unordered_map registry_; - std::unordered_map help_message_; - std::mutex register_mutex_; - - Registry(const Registry&) = delete; - Registry& operator=(const Registry&) = delete; -}; - -template -class CAFFE2_API Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const std::string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 -#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) -#else -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) -#endif - -/** - * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - CAFFE2_API Registry, __VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, __VA_ARGS__> \ - Registerer##RegistryName; \ - extern template class Registerer, __VA_ARGS__>; - -#define AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - Registry, __VA_ARGS__>* RegistryName() { \ - static Registry, __VA_ARGS__>* registry = \ - new Registry, __VA_ARGS__>(); \ - return registry; \ - } \ - template class Registerer, __VA_ARGS__>; - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - ::at::demangle_type<__VA_ARGS__>()); \ - } - -// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define AT_REGISTER_CLASS(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace at diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index bba2df4e0d1bec..a92b14d147c5ae 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -74,6 +74,8 @@ struct CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { template inline T* data() const { + // TODO: This is bad: it means storage.data() calls only work on + // T that are valid ScalarType. FIXME! auto data_type_T = at::scalarTypeToDataType(at::CTypeToScalarType::to()); if (dtype().id() != data_type_T) { AT_ERROR( diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index fa31741313db39..31de431bf367b0 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -533,6 +533,7 @@ class CAFFE2_API Tensor { Tensor mv(const Tensor & vec) const; Tensor mvlgamma(int64_t p) const; Tensor & mvlgamma_(int64_t p); + Tensor narrow_copy(int64_t dim, int64_t start, int64_t length) const; Tensor narrow(int64_t dim, int64_t start, int64_t length) const; Tensor permute(IntList dims) const; Tensor pin_memory() const; diff --git a/aten/src/ATen/core/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp index 5b568482d8dfe2..d8f38e98ef4434 100644 --- a/aten/src/ATen/core/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -45,6 +45,9 @@ IntList TensorImpl::sizes() const { } IntList TensorImpl::strides() const { + AT_ASSERTM(strides_.size() == sizes_.size(), + "Caffe2 tensors don't (yet) have meaningful strides and cannot " + "be used in PyTorch."); return strides_; } @@ -52,6 +55,10 @@ bool TensorImpl::compute_contiguous() const { bool is_contiguous = true; if (is_empty()) return is_contiguous; + if (strides_.empty()) { + // Special case for Caffe2 tensors which don't have strides set. 
+ return true; + } int64_t z = 1; for (int64_t d = dim() - 1; d >= 0; d--) { if (size(d) != 1) { @@ -82,6 +89,9 @@ int64_t TensorImpl::size(int64_t d) const { } int64_t TensorImpl::stride(int64_t d) const { + AT_ASSERTM(strides_.size() == sizes_.size(), + "Caffe2 tensors don't (yet) have meaningful strides and cannot " + "be used in PyTorch."); d = at::maybe_wrap_dim(d, dim(), false); return strides_[d]; } diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index 27232e2a3a8e97..7d7ce6a980249c 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -9,24 +9,142 @@ #include "ATen/core/TensorTypeIdRegistration.h" #include "ATen/core/LegacyTypeDispatch.h" #include "ATen/core/Backend.h" +#include "ATen/core/context_base.h" +#include "ATen/core/WrapDimMinimal.h" -struct THTensor; +#include "caffe2/core/allocator.h" +#include "caffe2/core/common.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" + +// A global boolean variable to control whether we free memory when a Tensor +// is shrinked to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +// +// This parameter is respected "upper-case" methods which call Resize() +// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_ +// or ShrinkTo, both of which guarantee to never to free memory. +CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. This only applies to functions which +// respect caffe2_keep_on_shrink. +CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + + +namespace caffe2 { + +// Defined by protobuf +class DeviceOption; + +} namespace at { class Scalar; struct Type; struct Storage; class Tensor; -} // namespace at -namespace at { +/** + * A utility function to convert vector to vector. + */ +inline std::vector ToVectorint64_t(ArrayRef src) { + return std::vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline int64_t size_from_dim_(int k, IntList dims) { + int64_t r = 1; + for (size_t i = k; i < dims.size(); ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline int64_t size_to_dim_(int k, IntList dims) { + CAFFE_ENFORCE((unsigned)k <= dims.size()); + int64_t r = 1; + for (int i = 0; i < k; ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline int64_t size_between_dim_(int k, int l, IntList dims) { + CAFFE_ENFORCE((unsigned)l < dims.size()); + int64_t r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +/** + * The low-level representation of a tensor, which contains a storage + * (which contains the actual data) and metadata (e.g., sizes and strides) + * describing this data as a tensor. + * + * Some basic characteristics about our in-memory representation of + * tensors: + * + * - It contains a pointer to a storage struct (Storage/StorageImpl) + * which contains the pointer to the actual data and records the + * data type and device of the view. This allows multiple tensors + * to alias the same underlying data, which allows to efficiently + * implement differing *views* on a tensor. 
+ * + * - The tensor struct itself records view-specific metadata about + * the tensor, e.g., sizes, strides and offset into storage. + * Each view of a storage can have a different size or offset. + * + * - This class is intrusively refcounted. It is refcounted so that + * we can support prompt deallocation of large tensors; it is + * intrusively refcounted so that we can still perform reference + * counted operations on raw pointers, which is often more convenient + * when passing tensors across language boundaries. + */ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { + data_type_ = storage_ ? storage_.dtype() : caffe2::TypeMeta{}; + } + + TensorImpl(const TensorImpl&) = default; + TensorImpl& operator=(const TensorImpl&) = default; + TensorImpl(TensorImpl&&) = default; + TensorImpl& operator=(TensorImpl&&) = default; + virtual void release_resources() override; + // TODO: Ideally, type_id() would be the *only* key we need to consult + // to do a dispatch, instead of having to grovel through three different + // variables. Here's what's standing in the way: + // + // - To eliminate ScalarType, we have to allocate a TensorTypeId for + // each ScalarType+Backend combination, and then set it appropriately + // when we initially allocate a TensorImpl. + // + // - To eliminate is_variable, we have to allocate two classes of + // TensorTypeId: ones that are variables, and ones that are not. + // We may not want to eliminate this in the short term, because + // hard-coding variable status into type_id() makes it more difficult + // to do the "thread-local no_grad" trick (where we process Variables + // "as if" they were non-Variables by setting a thread local variable.) + // Type & type() const { // NB: It's valid to use getTypeRaw here, because the TensorImpl // could not have been created without initializing the Type first. @@ -42,9 +160,17 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { virtual const Storage& storage() const; friend struct Type; + /** + * The number of elements in a tensor. + * + * WARNING: If you are using the Caffe2 API, this method can sometimes + * return -1, specifically when a tensor has not yet had its storage + * allocated by calling mutable_data(). You can use this case to + * test if a tensor is initialized or not. + */ virtual int64_t numel() const { #ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); + AT_ASSERT(numel_ == -1 || compute_numel() == numel_); #endif return numel_; } @@ -100,11 +226,25 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { template inline T * data() const { AT_ASSERT(!is_variable()); - return storage_.data() + storage_offset_; + CAFFE_ENFORCE_WITH_CALLER( + storage_.data() || numel_ == 0, + "The tensor has a non-zero number of elements, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + CAFFE_ENFORCE_WITH_CALLER( + storage_.IsType(), + "Tensor type mismatch, caller expects elements to be ", + caffe2::TypeMeta::TypeName(), + ", while tensor contains ", + data_type_.name(), + ". 
"); + // We managed the type check ourselves + return storage_.unsafe_data() + storage_offset_; } inline void* data() const { AT_ASSERT(!is_variable()); + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); return static_cast( static_cast(storage_.data()) + data_type_.itemsize() * storage_offset_); @@ -119,6 +259,9 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { const caffe2::TypeMeta& dtype() const { return data_type_; } + size_t itemsize() const { + return data_type_.itemsize(); + } virtual int64_t storage_offset() const { return storage_offset_; @@ -139,13 +282,13 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { } virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; + sizes_.at(dim) = new_size; refresh_numel(); refresh_contiguous(); } virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; + strides_.at(dim) = new_stride; refresh_numel(); refresh_contiguous(); } @@ -214,5 +357,516 @@ struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { private: TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable); + + public: + + at::DeviceType device_type() const { + AT_ASSERT(!is_variable()); + return storage_.device_type(); + } + + at::Device GetDevice() const { + return storage_.device(); + } + + /** + * The static context of a tensor intuitively represents the device + * type of a tensor; e.g., a CPU tensor is associated with the + * GetCPUStaticContext(). This method replaces the former Context template + * parameter which was previously used to identify the device type + * of a tensor. + */ + at::BaseStaticContext* GetStaticContext() const { + return ::caffe2::get_static_context(device_type()); + } + + /** + * @brief Copies the data from a source tensor, with a contex provided to + * carry out the underlying memcpy operation. This method respects + * caffe2_keep_on_shrink. 
+ */ + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (data_type_ != src.dtype()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); + storage_ = at::Storage(device_type(), src.dtype()); + data_type_ = src.dtype(); + } + if (src.numel() == -1) { + sizes_.clear(); + numel_ = -1; + strides_.clear(); + is_contiguous_ = true; + storage_.reset(); + data_type_ = caffe2::TypeMeta(); + return; + } + Resize(src.dims()); + if (numel() > 0) { + if (data_type_.copy()) { + CAFFE_ENFORCE( + device_type() == ::at::DeviceType::CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.device_type() == ::at::DeviceType::CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + data_type_.copy()(src.data(), raw_mutable_data(data_type_), numel()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.device_type() != ::at::DeviceType::CPU || device_type() == ::at::DeviceType::CPU) { + if (!context) { + CreateContext(src.GetDevice()) + ->CopyBytesToDevice( + numel() * itemsize(), + src.data(), + raw_mutable_data(data_type_), + device_type()); + } else { + CAFFE_ENFORCE( + context->device_type() == src.device_type(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + numel() * itemsize(), src.data(), raw_mutable_data(data_type_), device_type()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext(GetDevice()) + ->CopyBytesFromCPU( + numel() * itemsize(), + src.data(), + raw_mutable_data(data_type_)); + } + } + } + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. 
+ */ + void Extend(int64_t num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(sizes_.size(), 1u); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + auto newDims = sizes_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + sizes_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = sizes_; + newCapacity[0] = std::max( + newDims[0], std::ceil(sizes_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = sizes_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + data_type_, oldSize, oldData.get(), newData); + reserved_ = true; + sizes_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = sizes_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = sizes_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + sizes_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). + * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + * + * This method respects caffe2_keep_on_shrink. Consult the internal logic + * of this method to see exactly under what circumstances this flag matters. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. 
+ bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || + !caffe2::FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > + static_cast(caffe2::FLAGS_caffe2_max_keep_on_shrink_memory); + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + int64_t new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + sizes_ = dims; + update_to_contiguous_strides(); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = at::Storage(storage_.device_type(), data_type_); + storage_offset_ = 0; + } + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. 
+ /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + storage_offset_ = src.storage_offset(); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + data_type_ = data_type; + storage_offset_ = 0; + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; + storage_offset_ = 0; + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const caffe2::TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = data_type_.dtor() != nullptr; + storage_offset_ = 0; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (data_type_ != meta) { + storage_ = at::Storage(storage_.device_type(), meta); + } + } + data_type_ = meta; + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. 
+ auto size = numel_; + auto dtor = data_type_.dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + data_type_.ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. + auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. + */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && storage_.IsType()) { + return static_cast(storage_.data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); + } + + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const std::vector& dims() const { + // TODO: This method will no longer work if we change the + // internal representation of dims(). That's BAD. Let's get + // people to stop using this. + return sizes_; + } + + protected: + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ = false; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const std::vector& src) { + auto old_numel = numel_; + sizes_.resize(src.size()); + int64_t new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + sizes_[i] = src[i]; + } + update_to_contiguous_strides(); + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + sizes_.resize(0); + update_to_contiguous_strides(); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. 
+ bool SetDims(const int64_t d0) { + auto old_numel = numel_; + sizes_.resize(1); + sizes_[0] = d0; + update_to_contiguous_strides(); + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const int64_t d0, const int64_t d1) { + auto old_numel = numel_; + sizes_.resize(2); + sizes_[0] = d0; + sizes_[1] = d1; + update_to_contiguous_strides(); + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { + auto old_numel = numel_; + sizes_.resize(3); + sizes_[0] = d0; + sizes_[1] = d1; + sizes_[2] = d2; + update_to_contiguous_strides(); + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) { + auto old_numel = numel_; + sizes_.resize(4); + sizes_[0] = d0; + sizes_[1] = d1; + sizes_[2] = d2; + sizes_[3] = d3; + update_to_contiguous_strides(); + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } + + inline void update_to_contiguous_strides() { + strides_.resize(0); + is_contiguous_ = true; + } + }; } // namespace at diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index c6197b4fc2d08b..857131298376b1 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -902,6 +902,9 @@ inline Tensor Tensor::mvlgamma(int64_t p) const { inline Tensor & Tensor::mvlgamma_(int64_t p) { return type().mvlgamma_(*this, p); } +inline Tensor Tensor::narrow_copy(int64_t dim, int64_t start, int64_t length) const { + return type().narrow_copy(*this, dim, start, length); +} inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { return type().narrow(*this, dim, start, length); } diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index 3a2ccbe1e45edb..009ee309d7808a 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -492,6 +492,7 @@ struct CAFFE2_API Type { virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow_copy(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; virtual Tensor permute(const Tensor & self, IntList dims) const = 0; virtual Tensor pin_memory(const Tensor & self) const = 0; diff --git a/aten/src/ATen/core/VariableHooksInterface.cpp b/aten/src/ATen/core/VariableHooksInterface.cpp index 3728114492e53b..b9d90f56b8683b 100644 --- a/aten/src/ATen/core/VariableHooksInterface.cpp +++ b/aten/src/ATen/core/VariableHooksInterface.cpp @@ -24,6 +24,9 @@ namespace detail { } -AT_DEFINE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) +C10_DEFINE_REGISTRY( + VariableHooksRegistry, + VariableHooksInterface, + VariableHooksArgs) } // namespace at::detail diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index e8fd4da9e27536..0b8eb1532c1bc6 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -1,8 +1,8 @@ #pragma once -#include -#include #include +#include +#include "c10/util/Registry.h" namespace at { class LegacyTypeDispatch; @@ -39,8 +39,12 @@ struct CAFFE2_API VariableHooksInterface { // for the "..." 
in a variadic macro" struct CAFFE2_API VariableHooksArgs {}; -AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) -#define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) +C10_DECLARE_REGISTRY( + VariableHooksRegistry, + VariableHooksInterface, + VariableHooksArgs); +#define REGISTER_VARIABLE_HOOKS(clsname) \ + C10_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const VariableHooksInterface& getVariableHooks(); diff --git a/aten/src/ATen/core/WrapDimMinimal.h b/aten/src/ATen/core/WrapDimMinimal.h index 6971bac0b3f67c..859c1da0590a9d 100644 --- a/aten/src/ATen/core/WrapDimMinimal.h +++ b/aten/src/ATen/core/WrapDimMinimal.h @@ -20,4 +20,10 @@ static inline int64_t maybe_wrap_dim(int64_t dim, int64_t dim_post_expr, bool wr return dim; } +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +// This is the "Caffe2" name +static inline int canonical_axis_index_(int axis_index, int ndims) { + return maybe_wrap_dim(axis_index, ndims, false); +} + } diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp index e34c6880c0210a..f81bd81361305f 100644 --- a/aten/src/ATen/core/context_base.cpp +++ b/aten/src/ATen/core/context_base.cpp @@ -1,5 +1,16 @@ #include +namespace at { + +C10_DEFINE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +} // namespace at + namespace caffe2 { // TODO: rename context.h -> context_cpu.h & context_base.h -> context.h diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 326cae5eb9691e..13bc885da344ee 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -6,11 +6,12 @@ #include #include -#include +#include +#include #include #include #include -#include +#include namespace caffe2 { class Event; @@ -31,11 +32,6 @@ class CAFFE2_API BaseStaticContext { virtual std::pair New(size_t nbytes) const = 0; - virtual std::unique_ptr CreateContext() = 0; - - virtual std::unique_ptr CreateContext( - const caffe2::DeviceOption&) = 0; - virtual DeviceType GetDeviceType() = 0; /* @@ -184,6 +180,22 @@ class CAFFE2_API BaseContext { } }; +// Context constructor registry +C10_DECLARE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +#define REGISTER_CONTEXT(type, ...) 
\ + C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__) + +inline std::unique_ptr CreateContext( + const at::Device& device) { + return at::ContextRegistry()->Create(device.type(), device); +} + } // namespace at namespace caffe2 { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 8dfb1e8ebb75b6..5df0a5b49ca93b 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,10 +1,19 @@ #include #include -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ - _(TensorList) _(Blob) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) \ + _(World) \ namespace torch { namespace jit { @@ -16,7 +25,7 @@ CAFFE2_API c10::intrusive_ptr ConstantString::create( namespace { template -std::ostream& printList(std::ostream & out, const ConstantList &v, +std::ostream& printList(std::ostream & out, const List &v, const std::string start, const std::string delim, const std::string finish) { out << start; for(size_t i = 0; i < v.elements().size(); ++i) { @@ -40,13 +49,13 @@ std::ostream& operator<<(std::ostream & out, const ConstantString & v) { } template -std::ostream& operator<<(std::ostream & out, const ConstantList & v) { +std::ostream& operator<<(std::ostream & out, const List & v) { return printList(out, v, "[", ", ", "]"); } // tuple case template<> -std::ostream& operator<<(std::ostream & out, const ConstantList & v) { +std::ostream& operator<<(std::ostream & out, const List & v) { return printList(out, v, "(", ", ", ")"); } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 513845d4c12af0..5e210d638d9226 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -33,16 +33,17 @@ struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { const ConstantString& v); }; -// non-mutable list template -struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { +struct C10_EXPORT List : c10::intrusive_ptr_target { private: - const std::vector elements_; + std::vector elements_; + public: - ConstantList(std::vector elements_) - : elements_(std::move(elements_)) {} - static c10::intrusive_ptr> create(std::vector elements_) { - return c10::make_intrusive>(std::move(elements_)); + typedef Elem ElemType; + + List(std::vector elements_) : elements_(std::move(elements_)) {} + static c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); } const std::vector& elements() const { return elements_; @@ -50,13 +51,30 @@ struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { operator const std::vector&() const { return elements(); } + + std::vector& elements() { + return elements_; + } + operator std::vector&() { + return elements(); + } +}; + +struct World { + int64_t world_id; }; struct IValue; -using Tuple = ConstantList; -using IntList = ConstantList; -using TensorList = ConstantList; -using DoubleList = ConstantList; +struct C10_EXPORT Tuple : public List { + using List::List; + static c10::intrusive_ptr create(std::vector elements_) { + return c10::make_intrusive(std::move(elements_)); + } +}; +using IntList = List; +using TensorList = List; +using DoubleList = List; +using GenericList = List; // IValue is the generic tagged union used by the interpreter to hold // all value types. 
@@ -65,10 +83,19 @@ using DoubleList = ConstantList; // to mark whether that type is a subtype of c10::intrusive_ptr_target and needs // retain/release calls. -#define TORCH_FORALL_TAGS(_) \ - _(None) \ - _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ - _(TensorList) _(Blob) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) \ + _(Double) \ + _(Int) \ + _(Tuple) \ + _(IntList) \ + _(DoubleList) \ + _(String) \ + _(TensorList) \ + _(Blob) \ + _(GenericList) \ + _(World) \ struct CAFFE2_API IValue final { IValue() @@ -128,6 +155,13 @@ struct CAFFE2_API IValue final { return at::Tensor(toIntrusivePtr()); } + const IValue& toIValue() const { + return *this; + } + IValue& toIValue() { + return *this; + } + IValue(caffe2::Blob blob) : tag(Tag::Blob), is_intrusive_ptr(true) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and @@ -170,6 +204,17 @@ struct CAFFE2_API IValue final { return payload.as_double; } + // World + IValue(World w) + : tag(Tag::World), is_intrusive_ptr(false) { + payload.as_world = w; + } + bool isWorld() const { return Tag::World == tag; } + World toWorld() const { + AT_ASSERT(isWorld()); + return payload.as_world; + } + // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { @@ -207,6 +252,7 @@ struct CAFFE2_API IValue final { const std::vector& toIntListRef() const; const std::vector& toDoubleListRef() const; const std::vector& toTensorListRef() const; + const std::vector& toGenericListRef() const; // ConstantString IValue(c10::intrusive_ptr v); @@ -247,6 +293,19 @@ struct CAFFE2_API IValue final { return toIntrusivePtr(); } + //GenericList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isGenericList() const { return Tag::GenericList == tag; } + c10::intrusive_ptr toGenericList() && { + AT_ASSERT(isGenericList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toGenericList() const & { + AT_ASSERT(isGenericList()); + return toIntrusivePtr(); + } + // None bool isNone() { return Tag::None == tag; @@ -338,6 +397,7 @@ struct CAFFE2_API IValue final { int64_t as_int; double as_double; c10::intrusive_ptr_target* as_intrusive_ptr; + World as_world; } payload; Tag tag; bool is_intrusive_ptr; @@ -362,12 +422,16 @@ DEFINE_TO(int64_t, toInt) DEFINE_TO(c10::intrusive_ptr, toDoubleList) DEFINE_TO(c10::intrusive_ptr, toIntList) DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toGenericList) DEFINE_TO(c10::intrusive_ptr, toString) DEFINE_TO(at::Scalar, toScalar) DEFINE_TO(bool, toInt) DEFINE_TO(std::vector, toIntListRef) DEFINE_TO(std::vector, toDoubleListRef) DEFINE_TO(std::vector, toTensorListRef) +DEFINE_TO(std::vector, toGenericListRef) +DEFINE_TO(World, toWorld) +DEFINE_TO(IValue, toIValue) #undef DEFINE_TO @@ -433,6 +497,14 @@ inline IValue::IValue(c10::intrusive_ptr v) inline IValue::IValue(std::vector v) : IValue(TensorList::create(std::move(v))) {} +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::GenericList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(GenericList::create(std::move(v))) {} + + inline const std::vector& IValue::toIntListRef() const { return toIntList()->elements(); } @@ -445,5 +517,9 @@ inline const std::vector& IValue::toTensorListRef() const { return toTensorList()->elements(); } +inline const std::vector& IValue::toGenericListRef() const { + return toGenericList()->elements(); +} + }} diff --git a/aten/src/ATen/cuda/CUDAContext.cpp 
b/aten/src/ATen/cuda/CUDAContext.cpp index 58248acfe17951..0a4649d9c41ad4 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -54,15 +54,13 @@ Allocator* getCUDADeviceAllocator() { } /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ - cusparseHandle_t getCurrentCUDASparseHandle() { - return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); - } +cusparseHandle_t getCurrentCUDASparseHandle() { + return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); +} - cublasHandle_t getCurrentCUDABlasHandle() { - return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); - } -#endif +cublasHandle_t getCurrentCUDABlasHandle() { + return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); +} } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 83a890da4d535e..3a480d2ca4e4e3 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -59,10 +59,8 @@ CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ -#ifndef __HIP_PLATFORM_HCC__ CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle(); -#endif } // namespace cuda diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index ec2ac11f305dcf..f3299b34cb7f9b 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -54,6 +54,6 @@ const CUDAHooksInterface& getCUDAHooks() { } } // namespace detail -AT_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +C10_DEFINE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) } // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 69149932ac7b98..b8cff1a7aa125f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -2,9 +2,10 @@ #include #include -#include #include +#include "c10/util/Registry.h" + #include #include #include @@ -131,9 +132,9 @@ struct CAFFE2_API CUDAHooksInterface { // for the "..." 
in a variadic macro" struct CAFFE2_API CUDAHooksArgs {}; -AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) +C10_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs); #define REGISTER_CUDA_HOOKS(clsname) \ - AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) + C10_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const CUDAHooksInterface& getCUDAHooks(); diff --git a/aten/src/ATen/detail/ComplexHooksInterface.cpp b/aten/src/ATen/detail/ComplexHooksInterface.cpp index 9755e288ff5fe7..a7ffcf1d625f2b 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.cpp +++ b/aten/src/ATen/detail/ComplexHooksInterface.cpp @@ -20,6 +20,8 @@ const ComplexHooksInterface& getComplexHooks() { } } // namespace detail -AT_DEFINE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) - +C10_DEFINE_REGISTRY( + ComplexHooksRegistry, + ComplexHooksInterface, + ComplexHooksArgs) } diff --git a/aten/src/ATen/detail/ComplexHooksInterface.h b/aten/src/ATen/detail/ComplexHooksInterface.h index e5d5c3ec2a83fa..52f835a30cc17b 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.h +++ b/aten/src/ATen/detail/ComplexHooksInterface.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include "c10/util/Registry.h" namespace at { @@ -16,9 +16,12 @@ struct CAFFE2_API ComplexHooksInterface { }; struct CAFFE2_API ComplexHooksArgs {}; -AT_DECLARE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) +C10_DECLARE_REGISTRY( + ComplexHooksRegistry, + ComplexHooksInterface, + ComplexHooksArgs); #define REGISTER_COMPLEX_HOOKS(clsname) \ - AT_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) + C10_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) namespace detail { CAFFE2_API const ComplexHooksInterface& getComplexHooks(); diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 189cadf0b6d1c6..1955d07b630d74 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -107,16 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. But it doesn't seem to be harmful. -# -# TODO: self_ty is a hack to make things work for native methods which need to -# take a dtype, but also need to dispatch differently for different types. -# Eliminate it at some point. 
TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} - const auto& self_ty = *this; - (void)self_ty; - ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${actuals}); + ${return_call} at::native::${native_type_method_dispatch}(/* actuals */ ${type_derived_call_actuals}); } """) TYPE_DERIVED_DEFINITION_NATIVE_MISSING = CodeTemplate("""\ @@ -1574,8 +1568,15 @@ def process_native(option): TYPE_DERIVED_DEFINITION_NATIVE_MISSING.substitute(env)) else: option['native_type_method_dispatch'] = native_dispatch + type_derived_call_actuals = [] + for actual, arg in zip(option['actuals'], option['arguments']): + if arg.get('is_type_dispatched', False): + type_derived_call_actuals.append('*this') + else: + type_derived_call_actuals.append(actual) type_object_definitions.append( - TYPE_DERIVED_DEFINITION_NATIVE.substitute(env)) + TYPE_DERIVED_DEFINITION_NATIVE.substitute( + env, type_derived_call_actuals=type_derived_call_actuals)) for declaration in declarations: for option in declaration['options']: diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index 1f93ecbc8235ab..d16458e5ad80a6 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -1,7 +1,7 @@ #include "ATen/native/TensorTransformations.h" #include -#include +#include #include #include diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c470f554c14234..31b8f59a779a65 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -148,6 +148,45 @@ Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { return at::as_strided_(self, size, stride, self.storage_offset()); } +Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + int64_t allDim = self.dim(); + int64_t end = start+length; + AT_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor."); + AT_CHECK(dim >= 0 && dim < allDim, + "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, "."); + AT_CHECK(start >= 0 && length >= 0 && end <= self.size(dim), + "Invalid range to narrow. 
range(start, start+length) must be a subset of range(0, ", self.size(dim), ").") + LongTensor indices = self._indices(); + int64_t sparseDims = self._sparseDims(); + + std::vector newSizes = self.sizes().vec(); + newSizes[dim]=length; + + Tensor newValues; + LongTensor newIndices; + if(dim < sparseDims){ + Tensor mask = (indices[dim] >= start).__and__((indices[dim] < end)); + newIndices = indices.masked_select(mask).view({sparseDims, -1}); + newIndices[dim].add_(-start); + Tensor nzIndices = mask.nonzero().view(-1); + newValues = self._values().index_select(0, nzIndices); + }else{ + /* This means we are narrowing on a dense dim, which is in effect just a + regular narrow on _values() */ + newIndices = indices; + int64_t ddim = dim - sparseDims + 1; + newValues = self._values().narrow_copy(ddim, start, length); + } + + SparseTensor newTensor = at::sparse_coo_tensor(newIndices, newValues, newSizes, self.type().options()); + _get_sparse_impl(newTensor)->set_coalesced(self.is_coalesced()); + return newTensor; +} + +Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + return self.narrow(dim, start, length).clone(); +} + Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { AT_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2cc0995dabadad..b4ebdfb634e422 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1174,6 +1174,14 @@ - func: mvlgamma_(Tensor self, int64_t p) -> Tensor variants: method +- func: narrow_copy(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor + variants: method + dispatch: + CPU: narrow_copy_dense + CUDA: narrow_copy_dense + SparseCPU: narrow_copy_sparse + SparseCUDA: narrow_copy_sparse + - func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor variants: function, method @@ -2060,8 +2068,8 @@ SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda -# This "raw copy" doesn't handle conversions NOR does it handle non-blocking. 
-- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor +- func: copy_sparse_to_sparse_(Tensor self, Tensor src, bool non_blocking=false) -> Tensor + variants: function dispatch: SparseCPU: copy_sparse_ SparseCUDA: copy_sparse_ diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 83aee52cf81021..7e2340be24a10f 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -204,7 +204,7 @@ SparseTensor new_with_tensor_and_size_sparse(const LongTensor& indices, const Te SparseTensor clone_sparse(const SparseTensor& self) { SparseTensor other = new_with_dims_and_size_sparse(self.type(), self._sparseDims(), self._denseDims(), self.sizes()); - _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values()); + _copy_into_sparse(other, _get_sparse_impl(self)->indices(), _get_sparse_impl(self)->values(), true); _get_sparse_impl(other)->set_coalesced(self.is_coalesced()); return other; } @@ -243,11 +243,11 @@ Tensor sparse_to_dense(const SparseTensor& self) { return dst.add_(self); } -SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src) { +SparseTensor& copy_sparse_(SparseTensor& self, const SparseTensor& src, bool non_blocking) { if (isSameTensor(self, src)) return self; _get_sparse_impl(self)->resize_(src._sparseDims(), src._denseDims(), src.sizes()); // NB: This seems to copy the underlying full indices/values buffer - _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values()); + _copy_into_sparse(self, _get_sparse_impl(src)->indices(), _get_sparse_impl(src)->values(), non_blocking); _get_sparse_impl(self)->set_coalesced(src.is_coalesced()); return self; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 8a8668fc48b8a1..c71e38450974a6 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -98,7 +98,7 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); } else { - r = raw_copy_sparse_(r, t.coalesce()); + copy_sparse_to_sparse_(r, t.coalesce()); } r._values().log1p_(); return r; @@ -192,7 +192,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S AT_CHECK(t.sizes().equals(src.sizes()), "add: expected sizes of 'self' and 'other' to match, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { - return raw_copy_sparse_(r, t); + return copy_sparse_to_sparse_(r, t); } if (t._nnz() == 0) { return mul_out_sparse_scalar(r, src, value); diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 2626eedebaf5e2..a0fbf4ea904cc4 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -50,8 +50,8 @@ inline void _alias_into_sparse(const SparseTensor& self, const LongTensor& indic // Take indices and values and makes a (data) copy of them to put into the sparse // indices/values. 
This used to be called THSTensor_(_set) -inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values) { - _alias_into_sparse(self, indices.clone(), values.clone()); +inline void _copy_into_sparse(const SparseTensor& self, const LongTensor& indices, const Tensor& values, bool non_blocking) { + _alias_into_sparse(self, self._indices().type().copy(indices, non_blocking), self._values().type().copy(values, non_blocking)); } // Does NOT make copies of indices/values diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 036666bec82ac2..2abc10e62c3d46 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -363,7 +363,7 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const AT_CHECK(t.sizes().equals(src.sizes()), "add: expected 'self' and 'other' to have same size, but ", t.sizes(), " != ", src.sizes()); if (src._nnz() == 0) { - return raw_copy_sparse_(r_, t); + return copy_sparse_to_sparse_(r_, t); } if (t._nnz() == 0) { return mul_out_sparse_scalar(r_, src, value); diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 0891f6d9f4f492..03309f8fe9eee3 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -18,7 +18,8 @@ namespace at { Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking) const { Tensor b_src; - std::tie(b_src) = expand_inplace(self, src, "copy"); + if (is_sparse()) b_src = src; + else std::tie(b_src) = expand_inplace(self, src, "copy"); return s_copy_(self, b_src, non_blocking); } @@ -28,19 +29,11 @@ Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional device_guard.set_index(to_device.value().index()); } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); - if (is_sparse()) { - auto indices = src._indices(); - auto values = src._values(); - auto & this_dense = toBackend(is_cuda() ? Backend::CUDA : Backend::CPU); - auto & this_dense_idx = this_dense.toScalarType(ScalarType::Long); - auto indices_copy = this_dense_idx.copy(indices, non_blocking); - auto values_copy = this_dense.copy(values, non_blocking); - return _sparse_coo_tensor_unsafe(indices_copy, values_copy, src.sizes()); - } else { - Tensor r = this->tensor(src.sizes()); - r.copy_(src, non_blocking); - return r; - } + Tensor r; + if (is_sparse()) r = this->native_tensor(); + else r = this->tensor(src.sizes()); + r.copy_(src, non_blocking); + return r; } void TypeDefault::backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const { diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index fc39eccee3926b..93a2d705bd8a08 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ b/aten/src/ATen/test/apply_test.cpp @@ -1,121 +1,135 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "cuda.h" #include "cuda_runtime.h" #include "ATen/cuda/detail/TensorInfo.cuh" - +#define ASSERT_EQ_CUDA(X, Y) \ + { \ + bool _isEQ = X == Y; \ + ASSERT_TRUE(_isEQ); \ + } /* -Tests related to tensor indexing and applying operations. + Tests related to tensor indexing and applying operations. 
*/ #ifndef _WIN32 -CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { - int sizes[] = {4, 4}; - int strides[] = {4, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); +// CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D +// contiguous") { +TEST(ApplyTest, Contiguous2D) { + int sizes[] = {4, 4}; + int strides[] = {4, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (4 * 4)); } -CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { - int sizes[] = {6, 3, 7}; - int strides[] = {3 * 7, 7, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); +// CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D +// contiguous") { +TEST(ApplyTest, Contiguous3D) { + int sizes[] = {6, 3, 7}; + int strides[] = {3 * 7, 7, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (6 * 3 * 7)); } - -CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { - int sizes[] = {4, 3, 2}; - int strides[] = {3 * 3, 3, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); - CATCH_REQUIRE(ti.sizes[1] == 2); +// CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor +// to a 2D tensor") { +TEST(ApplyTest, PartialCollapse3D) { + int sizes[] = {4, 3, 2}; + int strides[] = {3 * 3, 3, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (4 * 3)); + ASSERT_EQ_CUDA(ti.sizes[1], 2); } -CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { - int sizes[] = {3, 2}; - int strides[] = {2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); - CATCH_REQUIRE(ti.strides[0] == 2); +// Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor +TEST(ApplyTest, StridedCollapse2D) { + int sizes[] = {3, 2}; + int strides[] = {2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 2)); + ASSERT_EQ_CUDA(ti.strides[0], 2); } -CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); - CATCH_REQUIRE(ti.strides[0] == 22); - CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); - CATCH_REQUIRE(ti.strides[1] == 2); +// Collapses a 4D tensor to a 2D tensor +TEST(ApplyTest, PartialStridedCollapse4D) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 6)); + 
ASSERT_EQ_CUDA(ti.strides[0], 22); + ASSERT_EQ_CUDA(ti.sizes[1], (5 * 2)); + ASSERT_EQ_CUDA(ti.strides[1], 2); } -CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { - int sizes[] = {1, 10, 1, 5, 4}; - int strides[] = {4, 0, 16, 0, 1}; - ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; - ti.collapseDims(); - CATCH_REQUIRE(ti.dims == 2); - CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); - CATCH_REQUIRE(ti.strides[0] == 0); - CATCH_REQUIRE(ti.sizes[1] == 4); - CATCH_REQUIRE(ti.strides[1] == 1); +// Collapses a 5D tensor to a 1D tensor +TEST(ApplyTest, CollapsesZerosAndOnes) { + int sizes[] = {1, 10, 1, 5, 4}; + int strides[] = {4, 0, 16, 0, 1}; + ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; + ti.collapseDims(); + ASSERT_EQ_CUDA(ti.dims, 2); + ASSERT_EQ_CUDA(ti.sizes[0], (10 * 5)); + ASSERT_EQ_CUDA(ti.strides[0], 0); + ASSERT_EQ_CUDA(ti.sizes[1], 4); + ASSERT_EQ_CUDA(ti.strides[1], 1); } -CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { - int sizes[] = {1, 1, 1}; - int strides[] = {17, 12, 3}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims() == 0); - CATCH_REQUIRE(ti.dims == 1); - CATCH_REQUIRE(ti.sizes[0] == 1); - CATCH_REQUIRE(ti.strides[0] == 1); +// Collapses a 3D tensor to a point tensor +TEST(ApplyTest, CollapseToPointTensor) { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(), 0); + ASSERT_EQ_CUDA(ti.dims, 1); + ASSERT_EQ_CUDA(ti.sizes[0], 1); + ASSERT_EQ_CUDA(ti.strides[0], 1); } -CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims(1) == 1); - CATCH_REQUIRE(ti.dims == 3); - CATCH_REQUIRE(ti.sizes[0] == 3); - CATCH_REQUIRE(ti.strides[0] == (6 * 22)); - CATCH_REQUIRE(ti.sizes[1] == 6); - CATCH_REQUIRE(ti.strides[1] == 22); - CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); - CATCH_REQUIRE(ti.strides[2] == 2); +// Collapses a 4D tensor to a 3D tensor +TEST(ApplyTest, ExcludingInContiguous4D) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(1), 1); + ASSERT_EQ_CUDA(ti.dims, 3); + ASSERT_EQ_CUDA(ti.sizes[0], 3); + ASSERT_EQ_CUDA(ti.strides[0], (6 * 22)); + ASSERT_EQ_CUDA(ti.sizes[1], 6); + ASSERT_EQ_CUDA(ti.strides[1], 22); + ASSERT_EQ_CUDA(ti.sizes[2], (5 * 2)); + ASSERT_EQ_CUDA(ti.strides[2], 2); } -CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { - int sizes[] = {3, 6, 5, 2}; - int strides[] = {6 * 22, 22, 2 * 2, 2}; - ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - CATCH_REQUIRE(ti.collapseDims(2) == 1); - CATCH_REQUIRE(ti.dims == 3); - CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); - CATCH_REQUIRE(ti.strides[0] == 22); - CATCH_REQUIRE(ti.sizes[1] == 5); - CATCH_REQUIRE(ti.strides[1] == 4); - CATCH_REQUIRE(ti.sizes[2] == 2); - CATCH_REQUIRE(ti.strides[2] == 2); +// Collapses a 4D tensor to a 3D tensor +TEST(ApplyTest, RovingExclusion) { + int sizes[] = {3, 6, 5, 2}; + int strides[] = {6 * 22, 22, 2 * 2, 2}; + ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; + ASSERT_EQ_CUDA(ti.collapseDims(2), 1); + ASSERT_EQ_CUDA(ti.dims, 3); + ASSERT_EQ_CUDA(ti.sizes[0], (3 * 
6)); + ASSERT_EQ_CUDA(ti.strides[0], 22); + ASSERT_EQ_CUDA(ti.sizes[1], 5); + ASSERT_EQ_CUDA(ti.strides[1], 4); + ASSERT_EQ_CUDA(ti.sizes[2], 2); + ASSERT_EQ_CUDA(ti.strides[2], 2); } -CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { - int sizes[] = {1, 1, 1}; - int strides[] = {17, 12, 3}; - ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); -} - +// Attempts to exclude a nonexisting dimension +TEST(ApplyTest, InvalidExclusion) { + int sizes[] = {1, 1, 1}; + int strides[] = {17, 12, 3}; + ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; + ASSERT_ANY_THROW(ti.collapseDims(5)); +} #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index ab7e3522bbedae..71715a2d4b0d6e 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +107,38 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +// apply utils test 2-dim small contiguous +TEST(ApplyUtilsTest, Contiguous2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { +// apply utils test 2-dim small +TEST(ApplyUtilsTest, Small2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { +// apply utils test 2-dim +TEST(ApplyUtilsTest, _2D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { +// apply utils test 3-dim +TEST(ApplyUtilsTest, _3D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } -CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +// apply utils test 3-dim medium +TEST(ApplyUtilsTest, Medium3D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { +// apply utils test 10-dim +TEST(ApplyUtilsTest, _10D) { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index edb3f79fd2d55d..96c5ed11897481 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -8,17 +8,17 @@ using namespace std; using namespace at; void trace() { - Tensor foo = rand({12,12}); + Tensor foo = rand({12, 12}); // ASSERT foo is 2-dimensional and holds floats. 
- auto foo_a = foo.accessor(); + auto foo_a = foo.accessor(); float trace = 0; - for(int i = 0; i < foo_a.size(0); i++) { + for (int i = 0; i < foo_a.size(0); i++) { trace += foo_a[i][i]; } - EXPECT_FLOAT_EQ(foo.trace().item(), trace); + ASSERT_FLOAT_EQ(foo.trace().item(), trace); } // TEST_CASE( "atest", "[]" ) { @@ -26,82 +26,78 @@ TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); - auto foo = rand({12,6}); + auto foo = rand({12, 6}); - EXPECT_EQ(foo.size(0), 12); - EXPECT_EQ(foo.size(1), 6); + ASSERT_EQ(foo.size(0), 12); + ASSERT_EQ(foo.size(1), 6); - foo = foo+foo*3; + foo = foo + foo * 3; foo -= 4; Scalar a = 4; float b = a.to(); - EXPECT_EQ(b, 4); + ASSERT_EQ(b, 4); - foo = (foo*foo) == (foo.pow(3)); - foo = 2 + (foo+1); - //foo = foo[3]; - auto foo_v = foo.accessor(); + foo = (foo * foo) == (foo.pow(3)); + foo = 2 + (foo + 1); + // foo = foo[3]; + auto foo_v = foo.accessor(); - for(int i = 0; i < foo_v.size(0); i++) { - for(int j = 0; j < foo_v.size(1); j++) { + for (int i = 0; i < foo_v.size(0); i++) { + for (int j = 0; j < foo_v.size(1); j++) { foo_v[i][j]++; } } - EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); + ASSERT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); - float data[] = { 1, 2, 3, - 4, 5, 6}; + float data[] = {1, 2, 3, 4, 5, 6}; - auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); - auto f_a = f.accessor(); + auto f = CPU(kFloat).tensorFromBlob(data, {1, 2, 3}); + auto f_a = f.accessor(); - EXPECT_EQ(f_a[0][0][0], 1.0); - EXPECT_EQ(f_a[0][1][1], 5.0); + ASSERT_EQ(f_a[0][0][0], 1.0); + ASSERT_EQ(f_a[0][1][1], 5.0); - EXPECT_EQ(f.strides()[0], 6); - EXPECT_EQ(f.strides()[1], 3); - EXPECT_EQ(f.strides()[2], 1); - EXPECT_EQ(f.sizes()[0], 1); - EXPECT_EQ(f.sizes()[1], 2); - EXPECT_EQ(f.sizes()[2], 3); + ASSERT_EQ(f.strides()[0], 6); + ASSERT_EQ(f.strides()[1], 3); + ASSERT_EQ(f.strides()[2], 1); + ASSERT_EQ(f.sizes()[0], 1); + ASSERT_EQ(f.sizes()[1], 2); + ASSERT_EQ(f.sizes()[2], 3); // TODO(ezyang): maybe do a more precise exception type. 
- ASSERT_THROW(f.resize_({3,4,5}), std::exception); + ASSERT_THROW(f.resize_({3, 4, 5}), std::exception); { int isgone = 0; { - auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { - isgone++; - }); + auto f2 = + CPU(kFloat).tensorFromBlob(data, {1, 2, 3}, [&](void*) { isgone++; }); } - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } { int isgone = 0; Tensor a_view; { - auto f2 = CPU(kFloat).tensorFromBlob(data, {1,2,3}, [&](void*) { - isgone++; - }); - a_view = f2.view({3,2,1}); + auto f2 = + CPU(kFloat).tensorFromBlob(data, {1, 2, 3}, [&](void*) { isgone++; }); + a_view = f2.view({3, 2, 1}); } - EXPECT_EQ(isgone, 0); + ASSERT_EQ(isgone, 0); a_view.reset(); - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } - if(at::hasCUDA()) { + if (at::hasCUDA()) { int isgone = 0; { - auto base = CUDA(kFloat).tensor({1,2,3}); - auto f2 = CUDA(kFloat).tensorFromBlob(base.data_ptr(), {1,2,3}, [&](void*) { - isgone++; - }); + auto base = CUDA(kFloat).tensor({1, 2, 3}); + auto f2 = CUDA(kFloat).tensorFromBlob( + base.data_ptr(), {1, 2, 3}, [&](void*) { isgone++; }); } - EXPECT_EQ(isgone, 1); + ASSERT_EQ(isgone, 1); } } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 361d24b5a6b76f..791d80b1f42f95 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -19,36 +19,35 @@ using namespace at; using Catch::Matchers::StartsWith; -static void test(Type & type) { - CATCH_SECTION( "resize" ) { +static void test(Type& type) { + CATCH_SECTION("resize") { auto a = at::empty({0}, type.options()); - a.resize_({3,4}); + a.resize_({3, 4}); CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); CATCH_REQUIRE(a.numel() == 35); - } - CATCH_SECTION( "ones and dot" ) { + CATCH_SECTION("ones and dot") { Tensor b0 = ones({1, 1}, type); - CATCH_REQUIRE(2 == (b0+b0).sum().item()); + CATCH_REQUIRE(2 == (b0 + b0).sum().item()); Tensor b1 = ones({1, 2}, type); - CATCH_REQUIRE(4 == (b1+b1).sum().item()); + CATCH_REQUIRE(4 == (b1 + b1).sum().item()); Tensor b = ones({3, 4}, type); - CATCH_REQUIRE(24 == (b+b).sum().item()); + CATCH_REQUIRE(24 == (b + b).sum().item()); CATCH_REQUIRE(12 == b.numel()); CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).item() == 12); } - CATCH_SECTION( "rand" ) { - for(auto i = 0; i < 10; i++) { - Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); + CATCH_SECTION("rand") { + for (auto i = 0; i < 10; i++) { + Tensor a = rand({3, 4}, type.toScalarType(i % 2 == 0 ? 
kFloat : kDouble)); } } - CATCH_SECTION( "sort" ) { + CATCH_SECTION("sort") { Tensor b = rand({3, 4}, type); auto z = b.sort(1); @@ -57,93 +56,101 @@ static void test(Type & type) { CATCH_REQUIRE(z_sorted[0][0].item() < z_sorted[0][1].item()); } - if(type.backend() != Backend::CUDA) - CATCH_SECTION( "randperm" ) { - Tensor b = randperm(15, type); - Tensor rv, ri; - std::tie(rv, ri) = sort(b, 0); - CATCH_REQUIRE(rv[0].item() <= rv[1].item()); - } + if (type.backend() != Backend::CUDA) + CATCH_SECTION("randperm") { + Tensor b = randperm(15, type); + Tensor rv, ri; + std::tie(rv, ri) = sort(b, 0); + CATCH_REQUIRE(rv[0].item() <= rv[1].item()); + } - CATCH_SECTION( "context" ) { + CATCH_SECTION("context") { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - CATCH_SECTION( "add" ) { + CATCH_SECTION("add") { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); - //TODO:0-dim Tensor d(3.f); + // TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE(add(c, d).allclose(a + a + b + d)); } - CATCH_SECTION( "loads of adds" ) { + CATCH_SECTION("loads of adds") { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for(auto i = 0; i < 100000; i++) { + for (auto i = 0; i < 100000; i++) { add_out(r, r, d); } auto end = std::chrono::high_resolution_clock::now(); - //TODO TEST PERF? - std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); + // TODO TEST PERF? + std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + CATCH_REQUIRE(norm(100000 * d).item() == norm(r).item()); } - CATCH_SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION("loads of adds (with copy)") { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for(auto i = 0; i < 100000; i++) { + for (auto i = 0; i < 100000; i++) { r = add(r, d); } auto end = std::chrono::high_resolution_clock::now(); - //TODO TEST PERF? - std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); + // TODO TEST PERF? 
+ std::cout << std::dec << " " + << std::chrono::duration_cast( + end - begin) + .count() + << " ms" << std::endl; + CATCH_REQUIRE(norm(100000 * d).item() == norm(r).item()); } - CATCH_SECTION( "isContiguous" ) { + CATCH_SECTION("isContiguous") { Tensor a = rand({3, 4}, type); CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); CATCH_REQUIRE(!a.is_contiguous()); } - CATCH_SECTION( "permute" ) { + CATCH_SECTION("permute") { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - CATCH_SECTION( "mm" ) { + CATCH_SECTION("mm") { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - CATCH_SECTION( "squeeze" ) { + CATCH_SECTION("squeeze") { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); - //TODO 0-dim squeeze + // TODO 0-dim squeeze CATCH_REQUIRE(a[0].equal(b)); } - CATCH_SECTION( "copy" ) { + CATCH_SECTION("copy") { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); CATCH_REQUIRE(a.equal(e)); } - CATCH_SECTION( "copy (broadcasting)" ) { + CATCH_SECTION("copy (broadcasting)") { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); @@ -152,12 +159,12 @@ static void test(Type & type) { } } - CATCH_SECTION( "abs(value)" ) { + CATCH_SECTION("abs(value)") { Tensor r = at::abs(type.scalarTensor(-3)); CATCH_REQUIRE(r.item() == 3); } -//TODO(zach): operator overloads +// TODO(zach): operator overloads #if 0 { std::cout << "eq (value):" << std::endl; @@ -168,60 +175,60 @@ static void test(Type & type) { } #endif - CATCH_SECTION( "adding a value with a scalar" ) { + CATCH_SECTION("adding a value with a scalar") { Tensor a = rand({4, 3}, type); - CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4, 3}, type) + a).equal(add(a, 1))); } - CATCH_SECTION( "select" ) { + CATCH_SECTION("select") { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); - CATCH_REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE(a[0][3].equal(a_13[0])); + CATCH_REQUIRE(a[2][3].equal(a_13_02)); } - CATCH_SECTION( "zero-dim" ) { - Tensor a = type.scalarTensor(4); //rand(type, {1}); + CATCH_SECTION("zero-dim") { + Tensor a = type.scalarTensor(4); // rand(type, {1}); - Tensor b = rand({3,4}, type); + Tensor b = rand({3, 4}, type); CATCH_REQUIRE((a + a).dim() == 0); CATCH_REQUIRE((1 + a).dim() == 0); CATCH_REQUIRE((b + a).dim() == 2); CATCH_REQUIRE((a + b).dim() == 2); - auto c = rand({3,4}, type); + auto c = rand({3, 4}, type); CATCH_REQUIRE(c[1][2].dim() == 0); - auto f = rand({3,4}, type); + auto f = rand({3, 4}, type); f[2] = zeros({4}, type); f[1][0] = -1; CATCH_REQUIRE(f[2][0].item() == 0); } - CATCH_SECTION( "tensor from TH" ) { + CATCH_SECTION("tensor from TH") { int a = 4; - THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); + THFloatTensor* t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); - Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); + Tensor tt = CPU(kFloat).unsafeTensorFromTH(t, false); CATCH_REQUIRE_NOTHROW(tt); } - CATCH_SECTION( "item" ) { - Tensor a = zeros({3,4}); - Tensor b = ones({3,7}); - Tensor c = cat({a,b},1); + CATCH_SECTION("item") { + Tensor a = zeros({3, 4}); + Tensor b = ones({3, 7}); + Tensor c = cat({a, b}, 1); CATCH_REQUIRE(c.size(1) == 11); Tensor 
e = rand({}); CATCH_REQUIRE(*e.data() == e.sum().item()); } - CATCH_SECTION( "to string" ) { - Tensor b = ones({3,7})*.0000001f; + CATCH_SECTION("to string") { + Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0, expect.size()) == expect); } CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); @@ -243,8 +250,7 @@ static void test(Type & type) { } CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), - StartsWith( - "Can only index tensors with integral scalars")); + StartsWith("Can only index tensors with integral scalars")); } CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); @@ -254,8 +260,7 @@ static void test(Type & type) { } CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), - StartsWith( - "Can only index tensors with integral scalars")); + StartsWith("Can only index tensors with integral scalars")); CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); @@ -275,16 +280,16 @@ static void test(Type & type) { } } -CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE("basic tests CPU", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE("basic tests GPU", "[cuda]") { manual_seed(123, at::kCUDA); - if(at::hasCUDA()) { + if (at::hasCUDA()) { test(CUDA(kFloat)); } } diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index 822a1d79df1bda..8bebb7d8fdd907 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,192 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" + +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -CATCH_TEST_CASE( "broadcast", "[]" ) { +// can't expand empty tensor +void TestEmptyTensor(Type& T) { + auto empty = randn({0}, T); + ASSERT_ANY_THROW(empty.expand({3})); +} + +// out-place function with 2 args +void TestOut2Basic(Type& T) { + auto a = randn({3, 1}, T); + auto b = randn({5}, T); + std::vector expanded_sizes = {3, 5}; + ASSERT_TRUE( + (a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); +} + +// with scalar +void TestOut2WithScalar(Type& T) { + auto aScalar = ones({1}, T); + aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + auto b = randn({3, 5}, T); + ASSERT_TRUE( + (aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); +} +// old fallback behavior yields error +void TestOut2OldFallback(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({5, 3}, T); + ASSERT_ANY_THROW(a + b); +} + +// with mismatched sizes +void TestOut2MismatchedSizes(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({7, 5}, T); + ASSERT_ANY_THROW(a + b); +} + +// out-place function with 3 args +void TestOut3Basic(Type& T) { + auto a = randn({3, 1, 1}, T); + auto b = randn({1, 2, 1}, T); + auto c = randn({1, 1, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + ASSERT_TRUE((a + b + c).equal( + a.expand(expanded_sizes) + b.expand(expanded_sizes) + + c.expand(expanded_sizes))); +} + +// with scalar +void TestOut3WithScalar(Type& T) { + auto aTensorScalar = ones({1}, T); + aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + auto b = randn({3, 2, 1}, T); + auto c = randn({1, 2, 5}, T); + std::vector expanded_sizes = {3, 2, 5}; + 
ASSERT_TRUE(aTensorScalar.addcmul(b, c).equal( + aTensorScalar.expand(expanded_sizes) + .addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); +} + +// old fallback behavior yields error +void TestOut3OldFallback(Type& T) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 3, 2}, T); + ASSERT_ANY_THROW(a.addcmul(b, c)); +} + +// with mismatched sizes +void TestOut3MismatchedSizes(Type& T) { + auto a = randn({3, 2, 5}, T); + auto b = randn({2, 3, 5}, T); + auto c = randn({5, 5, 5}, T); + ASSERT_ANY_THROW(a.addcmul(b, c)); +} + +// in-place function with 2 args +void TestIn2Basic(Type& T) { + auto a = randn({3, 5}, T); + auto b = randn({3, 1}, T); + ASSERT_TRUE((a + b).equal(a + b.expand({3, 5}))); +} + +// with scalar +void TestIn2WithScalar(Type& T) { + auto a = randn({3, 5}, T); + auto bScalar = ones({1}, T); + bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); +} + +// error: would have to expand inplace arg +void TestIn2ExpandError(Type& T) { + auto a = randn({1, 5}, T); + auto b = randn({3, 1}, T); + ASSERT_ANY_THROW(a.add_(b)); +} + +// in-place function with 3 args +void TestIn3Basic(Type& T) { + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + auto aClone = a.clone(); + ASSERT_TRUE(a.addcmul_(b, c).equal( + aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); +} + +// with scalar +void TestIn3WithScalar(Type& T) { + auto a = randn({3, 5, 2}, T); + auto b = randn({3, 1, 2}, T); + auto c = randn({1, 5, 1}, T); + auto aClone = a.clone(); + auto bScalar = ones({1}, T); + bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(a.addcmul_(bScalar, c) + .equal(aClone.addcmul_( + bScalar.expand(a.sizes()), c.expand(a.sizes())))); +} + +// error: would have to expand inplace arg +void TestIn3ExpandError(Type& T) { + auto a = randn({1, 3, 5}, T); + auto b = randn({4, 1, 1}, T); + auto c = randn({1, 3, 1}, T); + ASSERT_ANY_THROW(a.addcmul_(b, c)); +} + +// explicit dim specification +void TestExplicitDimBasic(Type& T) { + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + ASSERT_TRUE(a.addmm(b, c).equal(a.expand({5, 7}).addmm(b, c))); +} + +// with scalar +void TestExplicitDimWithScalar(Type& T) { + auto a = randn({1}, T); + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + Tensor aScalar = ones({1}, T); + aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); +} + +// with mismatched sizes +void TestExplicitDimWithMismatchedSizes(Type& T) { + auto b = randn({5, 3}, T); + auto c = randn({3, 7}, T); + auto a = randn({3, 3}, T); + ASSERT_ANY_THROW(a.addmm(b, c)); +} + +TEST(BroadcastTest, Broadcast) { manual_seed(123, at::kCPU); + Type& T = CPU(kFloat); + + TestEmptyTensor(T); + + TestOut2Basic(T); + TestOut2WithScalar(T); + TestOut2OldFallback(T); + TestOut2MismatchedSizes(T); + + TestOut3Basic(T); + TestOut3WithScalar(T); + TestOut3OldFallback(T); + TestOut3MismatchedSizes(T); + + TestIn2Basic(T); + TestIn2WithScalar(T); + TestIn2ExpandError(T); + + TestIn3Basic(T); + TestIn3WithScalar(T); + TestIn3ExpandError(T); - Type & T = CPU(kFloat); - - // 0) pre-req tests: - CATCH_SECTION( "can't expand empty tensor" ) { - auto empty = randn({0}, T); - _CATCH_REQUIRE_THROWS(empty.expand({3})); - } - - // 1) out-place function with 2 args - CATCH_SECTION( "out-place function with 2 args" ) { - - CATCH_SECTION( 
"basic" ) { - auto a = randn({3, 1}, T); - auto b = randn({5}, T); - std::vector expanded_sizes = {3, 5}; - CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); - } - - CATCH_SECTION( "with scalar" ) { - auto aScalar = ones({1}, T); - aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - auto b = randn({3, 5}, T); - CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); - } - - CATCH_SECTION( "old fallback behavior yields error" ) { - auto a = randn({3, 5}, T); - auto b = randn({5, 3}, T); - _CATCH_REQUIRE_THROWS(a + b); - } - - CATCH_SECTION( "with mismatched sizes" ) { - auto a = randn({3, 5}, T); - auto b = randn({7, 5}, T); - _CATCH_REQUIRE_THROWS(a + b); - } - } - - CATCH_SECTION( "out-place function with 3 args" ) { - - CATCH_SECTION( "basic" ) { - auto a = randn({3, 1, 1}, T); - auto b = randn({1, 2, 1}, T); - auto c = randn({1, 1, 5}, T); - std::vector expanded_sizes = {3, 2, 5}; - CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); - } - - CATCH_SECTION( "with scalar" ) { - auto aTensorScalar = ones({1}, T); - aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - auto b = randn({3, 2, 1}, T); - auto c = randn({1, 2, 5}, T); - std::vector expanded_sizes = {3, 2, 5}; - CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( - aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); - } - - CATCH_SECTION( "old fallback behavior yields error" ) { - auto a = randn({3, 2, 5}, T); - auto b = randn({2, 3, 5}, T); - auto c = randn({5, 3, 2}, T); - _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); - } - - CATCH_SECTION( "with mismatched sizes" ){ - auto a = randn({3, 2, 5}, T); - auto b = randn({2, 3, 5}, T); - auto c = randn({5, 5, 5}, T); - _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); - } - } - - CATCH_SECTION( "in-place function with 2 args" ) { - CATCH_SECTION( "basic" ) { - auto a = randn({3, 5}, T); - auto b = randn({3, 1}, T); - CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); - } - - CATCH_SECTION( "with scalar" ) { - auto a = randn({3, 5}, T); - auto bScalar = ones({1}, T); - bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); - } - - CATCH_SECTION( "error: would have to expand inplace arg" ) { - auto a = randn({1, 5}, T); - auto b = randn({3, 1}, T); - _CATCH_REQUIRE_THROWS(a.add_(b)); - } - } - - CATCH_SECTION( "in-place function with 3 args" ) { - - auto a = randn({3, 5, 2}, T); - auto b = randn({3, 1, 2}, T); - auto c = randn({1, 5, 1}, T); - - CATCH_SECTION( "basic" ) { - auto aClone = a.clone(); - CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); - } - - CATCH_SECTION( "with scalar" ) { - auto aClone = a.clone(); - auto bScalar = ones({1}, T); - bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); - } - - CATCH_SECTION( "error: would have to expand inplace arg" ) { - auto a = randn({1, 3, 5}, T); - auto b = randn({4, 1, 1}, T); - auto c = randn({1, 3, 1}, T); - _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); - } - } - - CATCH_SECTION( "explicit dim specification" ) { - - auto a = randn({1}, T); - auto b = randn({5, 3}, T); - auto c = randn({3, 7}, T); - - CATCH_SECTION( "basic" ) { - CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); - } - - CATCH_SECTION( "with scalar" ) { - Tensor aScalar = 
ones({1}, T); - aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); - } - - CATCH_SECTION( "with mismatched sizes" ) { - auto a = randn({3, 3}, T); - _CATCH_REQUIRE_THROWS(a.addmm(b, c)); - } - } + TestExplicitDimBasic(T); + TestExplicitDimWithScalar(T); + TestExplicitDimWithMismatchedSizes(T); } diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp index b9b0a87990a9ce..9e7696b1372263 100644 --- a/aten/src/ATen/test/catch_utils.hpp +++ b/aten/src/ATen/test/catch_utils.hpp @@ -3,6 +3,8 @@ #define CATCH_CONFIG_PREFIX_ALL #include -// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; -// define our own version that doesn't warn. -#define _CATCH_REQUIRE_THROWS( ... ) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes +// warning; define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS(...) \ + INTERNAL_CATCH_THROWS( \ + "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__) diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index cce267100589e1..56ca901931384d 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -12,7 +11,6 @@ using namespace at; __device__ void test(){ - // test half construction and implicit conversions in device assert(Half(3) == Half(3.0f)); assert(static_cast(3.0f) == Half(3.0f)); @@ -24,7 +22,7 @@ __device__ void test(){ __half c = a - Half(b); assert(static_cast(c) == Half(1.0)); - // asserting if the functions used on + // asserting if the functions used on // half types give almost equivalent results when using // functions on double. // The purpose of these asserts are to test the device side @@ -61,17 +59,18 @@ __device__ void test(){ assert(::abs(::abs(Half(-3.0)) - ::abs(-3.0f)) <= threshold); assert(::abs(::round(Half(2.3)) - ::round(2.3f)) <= threshold); assert(::abs(::pow(Half(2.0), Half(10.0)) - ::pow(2.0f, 10.0f)) <= threshold); - assert(::abs(::atan2(Half(7.0), Half(0.0)) - ::atan2(7.0f, 0.0f)) <= threshold); + assert( + ::abs(::atan2(Half(7.0), Half(0.0)) - ::atan2(7.0f, 0.0f)) <= threshold); // note: can't use namespace on isnan and isinf in device code - #ifdef _MSC_VER - // Windows requires this explicit conversion. The reason is unclear - // related issue with clang: https://reviews.llvm.org/D37906 - assert(::abs(::isnan((float)Half(0.0)) - ::isnan(0.0f)) <= threshold); - assert(::abs(::isinf((float)Half(0.0)) - ::isinf(0.0f)) <= threshold); - #else - assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); - assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); - #endif +#ifdef _MSC_VER + // Windows requires this explicit conversion. 
The reason is unclear + // related issue with clang: https://reviews.llvm.org/D37906 + assert(::abs(::isnan((float)Half(0.0)) - ::isnan(0.0f)) <= threshold); + assert(::abs(::isinf((float)Half(0.0)) - ::isinf(0.0f)) <= threshold); +#else + assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); + assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); +#endif } __global__ void kernel(){ @@ -79,12 +78,13 @@ __global__ void kernel(){ } void launch_function(){ - kernel<<<1,1>>>(); + kernel<<<1, 1>>>(); } -CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +// half common math functions tests in device +TEST(HalfCuda, HalfCuda) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + ASSERT_TRUE(isEQ); } - diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index b64c530b355914..128e1cf5f5147e 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +7,15 @@ using namespace at; -CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { +// optional in cuda files +TEST(OptionalTest, OptionalTestCUDA) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - CATCH_REQUIRE(!trivially_destructible.has_value()); - CATCH_REQUIRE(!non_trivially_destructible.has_value()); + ASSERT_FALSE(trivially_destructible.has_value()); + ASSERT_FALSE(non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - CATCH_REQUIRE(trivially_destructible.has_value()); - CATCH_REQUIRE(non_trivially_destructible.has_value()); + ASSERT_TRUE(trivially_destructible.has_value()); + ASSERT_TRUE(non_trivially_destructible.has_value()); } - diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu index a529f38d748a1b..32f5f410bb2eb5 100644 --- a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -10,9 +9,10 @@ using namespace at; -__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, - PackedTensorAccessor t1a, - PackedTensorAccessor t2a){ +__global__ void test_tensor_packed_accessor_kernel( + PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a) { for (int64_t i = 0; i < resa.size(0); i++) { float val = 0.0f; for (int64_t j = 0; j < t1a.size(1); j++) { @@ -22,7 +22,8 @@ __global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor(); auto stream = at::cuda::getCurrentCUDAStream(); - + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + ASSERT_TRUE(isEQ); auto expected = mv(t1, t2); - CATCH_REQUIRE(res.allclose(expected)); + ASSERT_TRUE(res.allclose(expected)); } diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index 7b14174d3baeb3..f5645a7978c11f 100644 --- a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN 
-#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +20,6 @@ void testCudaRNGMultithread() { } }; -CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { - CATCH_SECTION( "multithread" ) - testCudaRNGMultithread(); +TEST(Cuda_RNGTest, MultithreadRNGTest) { + testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 4391867d166772..54da9420ff60a1 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +8,7 @@ using namespace at; using namespace at::native; -CATCH_TEST_CASE( "cudnn", "[cuda]" ) { +TEST(CUDNNTest, CUDNNTestCUDA) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -17,9 +16,12 @@ CATCH_TEST_CASE( "cudnn", "[cuda]" ) { DropoutDescriptor desc1, desc2; desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - - CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); + bool isEQ; + isEQ = (desc1.desc()->dropout == desc2.desc()->dropout); + ASSERT_TRUE(isEQ); + isEQ = (desc1.desc()->nstates == desc2.desc()->nstates); + ASSERT_TRUE(isEQ); + isEQ = (desc1.desc()->states == desc2.desc()->states); + ASSERT_TRUE(isEQ); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index bf0cf93f7c4064..71a8d535d01e4e 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -10,18 +9,13 @@ #include "test_seed.h" using namespace at; - -CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { - +TEST(TestDlconvertor, TestDlconvertor) { manual_seed(123, at::kCPU); - CATCH_INFO( "convert ATen to DLTensor" ); - - Tensor a = rand({3,4}); + Tensor a = rand({3, 4}); DLManagedTensor* dlMTensor = toDLPack(a); - CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - CATCH_REQUIRE(a.equal(b)); + ASSERT_TRUE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 32177705a2f883..5aa062f125b21a 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include #include @@ -12,53 +11,53 @@ using namespace at; -CATCH_TEST_CASE( "half arithmetic", "[]" ) { +TEST(TestHalf, Arithmetic) { Half zero = 0; Half one = 1; - CATCH_REQUIRE(zero + one == one); - CATCH_REQUIRE(zero + zero == zero); - CATCH_REQUIRE(zero * one == zero); - CATCH_REQUIRE(one * one == one); - CATCH_REQUIRE(one / one == one); - CATCH_REQUIRE(one - one == zero); - CATCH_REQUIRE(one - zero == one); - CATCH_REQUIRE(zero - one == -one); - CATCH_REQUIRE(one + one == Half(2)); - CATCH_REQUIRE(one + one == 2); + ASSERT_EQ(zero + one, one); + ASSERT_EQ(zero + zero, zero); + ASSERT_EQ(zero * one, zero); + ASSERT_EQ(one * one, one); + ASSERT_EQ(one / one, one); + ASSERT_EQ(one - one, zero); + ASSERT_EQ(one - zero, one); + ASSERT_EQ(zero - one, -one); + ASSERT_EQ(one + one, Half(2)); + ASSERT_EQ(one + one, 2); } -CATCH_TEST_CASE( "half comparisons", "[]" ) { +TEST(TestHalf, 
Comparisions) { Half zero = 0; Half one = 1; - CATCH_REQUIRE(zero < one); - CATCH_REQUIRE(zero < 1); - CATCH_REQUIRE(1 > zero); - CATCH_REQUIRE(0 >= zero); - CATCH_REQUIRE(0 != one); - CATCH_REQUIRE(zero == 0); - CATCH_REQUIRE(zero == zero); - CATCH_REQUIRE(zero == -zero); + ASSERT_LT(zero, one); + ASSERT_LT(zero, 1); + ASSERT_GT(1, zero); + ASSERT_GE(0, zero); + ASSERT_NE(0, one); + ASSERT_EQ(zero, 0); + ASSERT_EQ(zero, zero); + ASSERT_EQ(zero, -zero); } -CATCH_TEST_CASE( "half cast", "[]" ) { +TEST(TestHalf, Cast) { Half value = 1.5f; - CATCH_REQUIRE((int)value == 1); - CATCH_REQUIRE((short)value == 1); - CATCH_REQUIRE((long long)value == 1LL); - CATCH_REQUIRE((float)value == 1.5f); - CATCH_REQUIRE((double)value == 1.5); - CATCH_REQUIRE((bool)value == true); - CATCH_REQUIRE((bool)Half(0.0f) == false); + ASSERT_EQ((int)value, 1); + ASSERT_EQ((short)value, 1); + ASSERT_EQ((long long)value, 1LL); + ASSERT_EQ((float)value, 1.5f); + ASSERT_EQ((double)value, 1.5); + ASSERT_EQ((bool)value, true); + ASSERT_EQ((bool)Half(0.0f), false); } -CATCH_TEST_CASE( "half construction", "[]" ) { - CATCH_REQUIRE(Half((short)3) == Half(3.0f)); - CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f)); - CATCH_REQUIRE(Half(3) == Half(3.0f)); - CATCH_REQUIRE(Half(3U) == Half(3.0f)); - CATCH_REQUIRE(Half(3LL) == Half(3.0f)); - CATCH_REQUIRE(Half(3ULL) == Half(3.0f)); - CATCH_REQUIRE(Half(3.5) == Half(3.5f)); +TEST(TestHalf, Construction) { + ASSERT_EQ(Half((short)3), Half(3.0f)); + ASSERT_EQ(Half((unsigned short)3), Half(3.0f)); + ASSERT_EQ(Half(3), Half(3.0f)); + ASSERT_EQ(Half(3U), Half(3.0f)); + ASSERT_EQ(Half(3LL), Half(3.0f)); + ASSERT_EQ(Half(3ULL), Half(3.0f)); + ASSERT_EQ(Half(3.5), Half(3.5f)); } static std::string to_string(const Half& h) { @@ -67,31 +66,31 @@ static std::string to_string(const Half& h) { return ss.str(); } -CATCH_TEST_CASE( "half to string", "[]" ) { - CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5"); - CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100"); +TEST(TestHalf, Half2String) { + ASSERT_EQ(to_string(Half(3.5f)), "3.5"); + ASSERT_EQ(to_string(Half(-100.0f)), "-100"); } -CATCH_TEST_CASE( "half numeric limits", "[]" ) { +TEST(TestHalf, HalfNumericLimits) { using limits = std::numeric_limits; - CATCH_REQUIRE(limits::lowest() == -65504.0f); - CATCH_REQUIRE(limits::max() == 65504.0f); - CATCH_REQUIRE(limits::min() > 0); - CATCH_REQUIRE(limits::min() < 1); - CATCH_REQUIRE(limits::denorm_min() > 0); - CATCH_REQUIRE(limits::denorm_min() / 2 == 0); - CATCH_REQUIRE(limits::infinity() == std::numeric_limits::infinity()); - CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); - CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); + ASSERT_EQ(limits::lowest(), -65504.0f); + ASSERT_EQ(limits::max(), 65504.0f); + ASSERT_GT(limits::min(), 0); + ASSERT_LT(limits::min(), 1); + ASSERT_GT(limits::denorm_min(), 0); + ASSERT_EQ(limits::denorm_min() / 2, 0); + ASSERT_EQ(limits::infinity(), std::numeric_limits::infinity()); + ASSERT_NE(limits::quiet_NaN(), limits::quiet_NaN()); + ASSERT_NE(limits::signaling_NaN(), limits::signaling_NaN()); } // Check the declared type of members of numeric_limits matches // the declared type of that member on numeric_limits -#define ASSERT_SAME_TYPE(name) \ - static_assert( \ - std::is_same< \ - decltype(std::numeric_limits::name), \ +#define ASSERT_SAME_TYPE(name) \ + static_assert( \ + std::is_same< \ + decltype(std::numeric_limits::name), \ decltype(std::numeric_limits::name)>::value, \ "decltype(" #name ") differs") @@ -119,7 +118,7 @@ 
ASSERT_SAME_TYPE(max_exponent10); ASSERT_SAME_TYPE(traps); ASSERT_SAME_TYPE(tinyness_before); -CATCH_TEST_CASE( "half common math functions test", "[]" ) { +TEST(TestHalf, CommonMath) { float threshold = 0.00001; assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= threshold); assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold); @@ -147,14 +146,22 @@ CATCH_TEST_CASE( "half common math functions test", "[]" ) { assert(std::abs(std::erfc(Half(10.0)) - std::erfc(10.0f)) <= threshold); assert(std::abs(std::abs(Half(-3.0)) - std::abs(-3.0f)) <= threshold); assert(std::abs(std::round(Half(2.3)) - std::round(2.3f)) <= threshold); - assert(std::abs(std::pow(Half(2.0), Half(10.0)) - std::pow(2.0f, 10.0f)) <= threshold); - assert(std::abs(std::atan2(Half(7.0), Half(0.0)) - std::atan2(7.0f, 0.0f)) <= threshold); - #ifdef __APPLE__ - // @TODO: can macos do implicit conversion of Half? - assert(std::abs(std::isnan(static_cast(Half(0.0))) - std::isnan(0.0f)) <= threshold); - assert(std::abs(std::isinf(static_cast(Half(0.0))) - std::isinf(0.0f)) <= threshold); - #else - assert(std::abs(std::isnan(Half(0.0)) - std::isnan(0.0f)) <= threshold); - assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold); - #endif -} \ No newline at end of file + assert( + std::abs(std::pow(Half(2.0), Half(10.0)) - std::pow(2.0f, 10.0f)) <= + threshold); + assert( + std::abs(std::atan2(Half(7.0), Half(0.0)) - std::atan2(7.0f, 0.0f)) <= + threshold); +#ifdef __APPLE__ + // @TODO: can macos do implicit conversion of Half? + assert( + std::abs(std::isnan(static_cast(Half(0.0))) - std::isnan(0.0f)) <= + threshold); + assert( + std::abs(std::isinf(static_cast(Half(0.0))) - std::isinf(0.0f)) <= + threshold); +#else + assert(std::abs(std::isnan(Half(0.0)) - std::isnan(0.0f)) <= threshold); + assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold); +#endif +} diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu index d09a423d7ca72d..21169e9ee30625 100644 --- a/aten/src/ATen/test/integer_divider_test.cu +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or // (b-1), so it takes a few minutes to run. 
@@ -20,24 +19,25 @@ struct TestCase { int steps; TestCase(Value dividend, int divisor_idx, int steps) - : dividend(dividend), divisor_idx(divisor_idx), steps(steps) { } + : dividend(dividend), divisor_idx(divisor_idx), steps(steps) {} }; -template -__global__ void testIntDivider(const IntDivider *dividers, - const TestCase *testCases, - int numCases) -{ +template +__global__ void testIntDivider( + const IntDivider* dividers, + const TestCase* testCases, + int numCases) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < numCases; i += stride) { - const TestCase &tc = testCases[i]; + const TestCase& tc = testCases[i]; Value dividend = tc.dividend; - const IntDivider ÷r = dividers[tc.divisor_idx]; + const IntDivider& divider = dividers[tc.divisor_idx]; Value divisor = divider.divisor; for (int j = 0; j < tc.steps; j++) { - if (sizeof(Value) == 4 && dividend > INT32_MAX) return; + if (sizeof(Value) == 4 && dividend > INT32_MAX) + return; DivMod qr = divider.divmod(dividend); assert(qr.div == dividend / divisor && qr.mod == dividend % divisor); @@ -62,18 +62,22 @@ class IntDividerTester { cudaError_t err; err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); - CATCH_REQUIRE(err == cudaSuccess); + isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); } ~IntDividerTester() { cudaError_t err; err = cudaFree(dividersBuf_); - CATCH_REQUIRE(err == cudaSuccess); + bool isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); err = cudaFree(testCasesBuf_); - CATCH_REQUIRE(err == cudaSuccess); + isEQ = err == cudaSuccess; + EXPECT_TRUE(isEQ); } void addTestCase(Value dividend, Value divisor, int steps) { @@ -85,29 +89,39 @@ class IntDividerTester { testCases_.emplace_back(dividend, dividers_.size() - 1, steps); // Launch the test kernel if the buffer is full. 
- if (testCases_.size() == NUM_CASES) flush(); + if (testCases_.size() == NUM_CASES) + flush(); } void flush() { cudaError_t err; - - if (testCases_.empty()) return; - CATCH_REQUIRE(!dividers_.empty()); - - CATCH_REQUIRE(dividers_.size() <= NUM_CASES); - CATCH_REQUIRE(testCases_.size() <= NUM_CASES); - err = cudaMemcpy(dividersBuf_, dividers_.data(), - dividers_.size() * sizeof(IntDivider), - cudaMemcpyHostToDevice); - CATCH_REQUIRE(err == cudaSuccess); - err = cudaMemcpy(testCasesBuf_, testCases_.data(), - testCases_.size() * sizeof(TestCase), - cudaMemcpyHostToDevice); - CATCH_REQUIRE(err == cudaSuccess); + bool isTrue; + if (testCases_.empty()) + return; + + ASSERT_FALSE(dividers_.empty()); + + isTrue = dividers_.size() <= NUM_CASES; + ASSERT_TRUE(isTrue); + isTrue = testCases_.size() <= NUM_CASES; + ASSERT_TRUE(isTrue); + err = cudaMemcpy( + dividersBuf_, + dividers_.data(), + dividers_.size() * sizeof(IntDivider), + cudaMemcpyHostToDevice); + isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); + err = cudaMemcpy( + testCasesBuf_, + testCases_.data(), + testCases_.size() * sizeof(TestCase), + cudaMemcpyHostToDevice); + isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); int numCases = testCases_.size(); - testIntDivider<<<512, 512>>>( - dividersBuf_, testCasesBuf_, numCases); + testIntDivider<<<512, 512>>>(dividersBuf_, testCasesBuf_, numCases); dividers_.clear(); testCases_.clear(); @@ -117,8 +131,8 @@ class IntDividerTester { vector> dividers_; vector> testCases_; - IntDivider *dividersBuf_; - TestCase *testCasesBuf_; + IntDivider* dividersBuf_; + TestCase* testCasesBuf_; }; static void testUint32Divider() @@ -128,15 +142,18 @@ static void testUint32Divider() IntDividerTester tester; for (uint64_t divisor = 1; divisor <= INT32_MAX; divisor++) { - if (divisor < 1000000 && divisor % 10000 == 0) fprintf(stderr, "."); - if (divisor % 10000000 == 0) fprintf(stderr, "-"); + if (divisor < 1000000 && divisor % 10000 == 0) + fprintf(stderr, "."); + if (divisor % 10000000 == 0) + fprintf(stderr, "-"); // In order to save time, we only test when the remainder is zero or // (divisor - 1). 
uint64_t dividend = 0; while (dividend <= INT32_MAX) { uint64_t steps = (INT32_MAX - dividend) / divisor + 1; - if (steps > MAX_STEPS) steps = MAX_STEPS; + if (steps > MAX_STEPS) + steps = MAX_STEPS; tester.addTestCase(dividend, divisor, steps); tester.addTestCase(dividend + divisor - 1, divisor, steps); @@ -180,11 +197,11 @@ static void testUint64Divider() tester.flush(); } -CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { - +TEST(TestCUDAIntegerDivider, IntegerDivider) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - CATCH_REQUIRE(err == cudaSuccess); + bool isTrue = err == cudaSuccess; + ASSERT_TRUE(isTrue); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 4c57b7d8ee1d96..6721c69b0e0f36 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,192 +1,222 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -using Catch::Matchers::StartsWith; +#define ASSERT_EQUAL(t1, t2) ASSERT_TRUE(t1.equal(t2)); -#define REQUIRE_EQUAL(t1, t2) \ - CATCH_REQUIRE(t1.equal(t2)); +#define ASSERT_ALLCLOSE(t1, t2) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2)); -#define REQUIRE_ALLCLOSE(t1, t2) \ - CATCH_REQUIRE(t1.is_same_size(t2)); \ - CATCH_REQUIRE(t1.allclose(t2)); - -#define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - CATCH_REQUIRE(t1.is_same_size(t2)); \ - CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - CATCH_REQUIRE(t1.size() == t2.size()); + ASSERT_EQ(t1.size(), t2.size()); for (size_t i = 0; i < t1.size(); ++i) { - REQUIRE_EQUAL(t1[ i ], t2[ i ]); + ASSERT_EQUAL(t1[i], t2[i]); } } -void test(Type & T, Type & AccT) { - auto t = randn({3, 3}, T); - - CATCH_SECTION( "split: test method, type, namespace give same result" ) { - auto splitMethod = t.split(1, 0); - auto splitType = T.split(t, 1, 0); - auto splitNs = at::split(t, 1, 0); - requireEqualTensorList(splitMethod, splitType); - requireEqualTensorList(splitMethod, splitNs); +// split: test method, type, namespace give same result +void TestSplit(Type& T, Tensor& t) { + auto splitMethod = t.split(1, 0); + auto splitType = T.split(t, 1, 0); + auto splitNs = at::split(t, 1, 0); + requireEqualTensorList(splitMethod, splitType); + requireEqualTensorList(splitMethod, splitNs); - // test rebuilding with cat - REQUIRE_EQUAL(at::cat(splitMethod, 0), t); - } + // test rebuilding with cat + ASSERT_EQUAL(at::cat(splitMethod, 0), t); +} - CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { - // test method, type, namespace give same result - auto chunkMethod = t.chunk(3, 0); - auto chunkType = T.chunk(t, 3, 0); - auto chunkNs = at::chunk(t, 3, 0); - requireEqualTensorList(chunkMethod, chunkType); - requireEqualTensorList(chunkMethod, chunkNs); +// chunk: test method, type, namespace give same result +void TestChunk(Type& T, Tensor& t) { + // test method, type, namespace give same result + auto chunkMethod = t.chunk(3, 0); + auto chunkType = T.chunk(t, 3, 0); + auto chunkNs = at::chunk(t, 3, 0); + requireEqualTensorList(chunkMethod, chunkType); + requireEqualTensorList(chunkMethod, chunkNs); + + // test rebuilding with cat + ASSERT_EQUAL(at::cat(chunkMethod, 0), t); +} - // test rebuilding with 
cat - REQUIRE_EQUAL(at::cat(chunkMethod, 0), t); +void TestStack(Type& T, Tensor& t) { + auto x = rand({2, 3, 4}); + auto y = rand({2, 3, 4}); + auto z = rand({2, 3, 4}); + for (int64_t dim = 0; dim < 4; ++dim) { + auto res = at::stack({x, y, z}, dim); + auto res_neg = at::stack({x, y, z}, dim - 4); + std::vector expected_size; + expected_size.insert( + expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); + expected_size.insert(expected_size.end(), 3); + expected_size.insert( + expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); + + ASSERT_EQUAL(res, res_neg); + ASSERT_TRUE(res.sizes().equals(expected_size)); + ASSERT_EQUAL(res.select(dim, 0), x); + ASSERT_EQUAL(res.select(dim, 1), y); + ASSERT_EQUAL(res.select(dim, 2), z); } +} - // stack - CATCH_SECTION( "stack" ) { - auto x = rand({2, 3, 4}); - auto y = rand({2, 3, 4}); - auto z = rand({2, 3, 4}); - for (int64_t dim = 0; dim < 4; ++dim) { - auto res = at::stack({x, y, z}, dim); - auto res_neg = at::stack({x, y, z}, dim - 4); - std::vector expected_size; - expected_size.insert(expected_size.end(), x.sizes().begin(), x.sizes().begin() + dim); - expected_size.insert(expected_size.end(), 3); - expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); - - REQUIRE_EQUAL(res, res_neg); - CATCH_REQUIRE(res.sizes().equals(expected_size)); - REQUIRE_EQUAL(res.select(dim, 0), x); - REQUIRE_EQUAL(res.select(dim, 1), y); - REQUIRE_EQUAL(res.select(dim, 2), z); - } - } +// size / stride +void TestSize(Type& T, Tensor& t) { + auto scalar = randn({}, T); + // Throw StartsWith("dimension specified as 0 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.size(0)); + // Throw StartsWith("dimension specified as -1 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.size(-1)); + // Throw StartsWith("dimension specified as 0 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.stride(0)); + // Throw StartsWith("dimension specified as -1 but tensor has no dimensions") + ASSERT_ANY_THROW(scalar.stride(-1)); + + auto empty = randn({0}, T); + ASSERT_EQ(empty.size(0), 0); + ASSERT_EQ(empty.size(-1), 0); + ASSERT_EQ(empty.stride(0), 1); + ASSERT_EQ(empty.stride(-1), 1); +} - CATCH_SECTION( "size / stride" ) { - auto scalar = randn({}, T); - CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - - auto empty = randn({0}, T); - CATCH_REQUIRE(empty.size(0) == 0); - CATCH_REQUIRE(empty.size(-1) == 0); - CATCH_REQUIRE(empty.stride(0) == 1); - CATCH_REQUIRE(empty.stride(-1) == 1); - } +void TestMatmul(Type& T, Tensor& t, Type& AccT) { + auto scalar = randn({}, T); + auto d1 = randn({3}, T); + auto d2 = randn({2, 3}, T); + + // 0-d + // Throw StartsWith("both arguments to matmul need to be at least 1D") + ASSERT_ANY_THROW(scalar.matmul(d2)); + // Throw StartsWith("both arguments to matmul need to be at least 1D") + ASSERT_ANY_THROW(d2.matmul(scalar)); + + // 1-d + ASSERT_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); + ASSERT_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); + auto d1o = randn({2}, T); + ASSERT_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); + + // 2-d + auto d2o = randn({3, 5}, T); + 
ASSERT_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); + + // > 2-d, 1-d + auto d3 = randn({5, 2, 3}, T); + ASSERT_ALLCLOSE( + d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); + ASSERT_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); + + auto d5 = randn({3, 2, 4, 2, 3}, T); + ASSERT_ALLCLOSE( + d5.matmul(d1), + d5.view({24, 2, 3}) + .bmm(d1.view({1, 3, 1}).expand({24, 3, 1})) + .view({3, 2, 4, 2})); + ASSERT_ALLCLOSE( + d1o.matmul(d5), + d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); + + // > 2-d, 2-d + // we use a "folding" algorithm in this case of matmul, so the direct + // comparison to bmm doesn't work; instead, compare to the higher precision + // computation (technically, we should always do this). Tolerances are + // selected empirically. + double atol = 1e-04; + double rtol = 1e-06; + d2 = randn({3, 4}, T); + d2o = randn({4, 2}, T); + auto result = d5.matmul(d2).toType(AccT); + + auto d5Acc = d5.toType(AccT); + auto d2Acc = d2.toType(AccT); + auto acc_result = d5Acc.view({24, 2, 3}) + .bmm(d2Acc.expand({24, 3, 4})) + .view({3, 2, 4, 2, 4}); + ASSERT_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); + ASSERT_ALLCLOSE( + d2o.matmul(d5), + d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); + + // > 2-d, > 2-d + auto d5o = randn({2, 1, 2, 4, 3, 2}, T); + auto d5_bmm_view = + d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); + auto d5o_bmm_view = + d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); + ASSERT_ALLCLOSE( + d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); + + // non-expandable case + auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); + // Throw Contains("must match the size") + ASSERT_ANY_THROW(d5.matmul(d5wrong)); +} - // matmul - CATCH_SECTION( "matmul" ) { - auto scalar = randn({}, T); - auto d1 = randn({3}, T); - auto d2 = randn({2, 3}, T); - - // 0-d - CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - - // 1-d - REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); - REQUIRE_ALLCLOSE(d2.matmul(d1), d2.mv(d1)); - auto d1o = randn({2}, T); - REQUIRE_ALLCLOSE(d1o.matmul(d2), d1o.unsqueeze(0).mm(d2).squeeze(0)); - - // 2-d - auto d2o = randn({3, 5}, T); - REQUIRE_ALLCLOSE(d2.matmul(d2o), d2.mm(d2o)); - - // > 2-d, 1-d - auto d3 = randn({5, 2, 3}, T); - REQUIRE_ALLCLOSE(d3.matmul(d1), d3.bmm(d1.view({1, 3, 1}).expand({5, 3, 1})).view({5, 2})); - REQUIRE_ALLCLOSE(d1o.matmul(d3), d1o.expand({5, 1, 2}).bmm(d3).view({5, 3})); - - auto d5 = randn({3, 2, 4, 2, 3}, T); - REQUIRE_ALLCLOSE(d5.matmul(d1), d5.view({24, 2, 3}).bmm(d1.view({1, 3, 1}).expand({24, 3, 1})).view({3, 2, 4, 2})); - REQUIRE_ALLCLOSE(d1o.matmul(d5), d1o.expand({24, 1, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 3})); - - // > 2-d, 2-d - // we use a "folding" algorithm in this case of matmul, so the direct comparison to bmm doesn't work; - // instead, compare to the higher precision computation (technically, we should always do this). - // Tolerances are selected empirically. 
- double atol = 1e-04; - double rtol = 1e-06; - d2 = randn({3, 4}, T); - d2o = randn({4, 2}, T); - auto result = d5.matmul(d2).toType(AccT); - - auto d5Acc = d5.toType(AccT); - auto d2Acc = d2.toType(AccT); - auto acc_result = d5Acc.view({24, 2, 3}).bmm(d2Acc.expand({24, 3, 4})).view({3, 2, 4, 2, 4}); - REQUIRE_ALLCLOSE_TOLERANCES(result, acc_result, atol, rtol); - REQUIRE_ALLCLOSE(d2o.matmul(d5), d2o.expand({24, 4, 2}).bmm(d5.view({24, 2, 3})).view({3, 2, 4, 4, 3})); - - // > 2-d, > 2-d - auto d5o = randn({2, 1, 2, 4, 3, 2}, T); - auto d5_bmm_view = d5.expand({2, 3, 2, 4, 2, 3}).contiguous().view({48, 2, 3}); - auto d5o_bmm_view = d5o.expand({2, 3, 2, 4, 3, 2}).contiguous().view({48, 3, 2}); - REQUIRE_ALLCLOSE(d5.matmul(d5o), d5_bmm_view.bmm(d5o_bmm_view).view({2, 3, 2, 4, 2, 2})); - - // non-expandable case - auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); - } +void TestStandardGammaGrad(Type& T, Tensor& t) { + // check empty + auto empty = ones({0}, T); + ASSERT_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); + + // check scalar equals one element + auto one_scalar = ones({}, T).mul(5); + auto one_with_dim = ones({1}, T).mul(5); + ASSERT_ALLCLOSE( + at::_standard_gamma_grad(one_scalar, one_scalar), + at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); + + // check mixing types + auto t1 = randn({3, 4}, T); + auto t2 = randn({3, 4}, T).toType(kDouble); + // Throw StartsWith("expected scalar type") + ASSERT_ANY_THROW(at::_standard_gamma_grad(t1, t2)); +} - // _standard_gamma_grad - CATCH_SECTION( "_standard_gamma_grad" ) { - // check empty - auto empty = ones({0}, T); - REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); - - // check scalar equals one element - auto one_scalar = ones({}, T).mul(5); - auto one_with_dim = ones({1}, T).mul(5); - REQUIRE_ALLCLOSE(at::_standard_gamma_grad(one_scalar, one_scalar), - at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); - - // check mixing types - auto t1 = randn({3, 4}, T); - auto t2 = randn({3, 4}, T).toType(kDouble); - CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); - } +void TestWhere(Type& T, Tensor& t) { + // empty + auto empty = ones({0}, T); + auto& bT = T.toScalarType(ScalarType::Byte); + auto empty_byte = ones({0}, bT); + ASSERT_EQUAL(empty, at::where(empty_byte, empty, empty)); + + // check scalar equals one element + auto x_scalar = ones({}, T).mul(5); + auto y_scalar = ones({}, T).mul(7); + auto cond_scalar = zeros({}, bT); + auto x_1d = x_scalar.unsqueeze(0); + auto y_1d = y_scalar.unsqueeze(0); + auto cond_1d = cond_scalar.unsqueeze(0); + ASSERT_ALLCLOSE( + at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), + at::where(cond_1d, x_1d, y_1d)); +} - CATCH_SECTION( "where" ) { - // empty - auto empty = ones({0}, T); - auto &bT = T.toScalarType(ScalarType::Byte); - auto empty_byte = ones({0}, bT); - REQUIRE_EQUAL(empty, at::where(empty_byte, empty, empty)); - - // check scalar equals one element - auto x_scalar = ones({}, T).mul(5); - auto y_scalar = ones({}, T).mul(7); - auto cond_scalar = zeros({}, bT); - auto x_1d = x_scalar.unsqueeze(0); - auto y_1d = y_scalar.unsqueeze(0); - auto cond_1d = cond_scalar.unsqueeze(0); - REQUIRE_ALLCLOSE(at::where(cond_scalar, x_scalar, y_scalar).unsqueeze(0), - at::where(cond_1d, x_1d, y_1d)); - } +void test(Type& T, Type& AccT) { + auto t = randn({3, 3}, T); + TestSplit(T, t); + TestChunk(T, t); + TestStack(T, t); + 
TestSize(T, t); + TestMatmul(T, t, AccT); + TestStandardGammaGrad(T, t); + TestWhere(T, t); } -CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { +TEST(TestNative, NativeTestCPU) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { +TEST(TestNative, NativeTestGPU) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index a89ca81da017f7..5bb3aafaff9247 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,31 +8,33 @@ using namespace at; -#define TRY_CATCH_ELSE(fn, catc, els) \ - { \ - /* avoid mistakenly passing if els code throws exception*/ \ - bool _passed = false; \ - try { \ - fn; \ - _passed = true; \ - els; \ - } catch (std::exception &e) { \ - CATCH_REQUIRE(!_passed); \ - catc; \ - } \ +#define TRY_CATCH_ELSE(fn, catc, els) \ + { \ + /* avoid mistakenly passing if els code throws exception*/ \ + bool _passed = false; \ + try { \ + fn; \ + _passed = true; \ + els; \ + } catch (std::exception & e) { \ + ASSERT_FALSE(_passed); \ + catc; \ + } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - CATCH_REQUIRE(lhs.dim() == rhs.dim()); - CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); + ASSERT_EQ(lhs.dim(), rhs.dim()); + ASSERT_TRUE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { - if(from_size.size() > to_size.size()) { + if (from_size.size() > to_size.size()) { return false; } - for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); ++from_dim_it) { - for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); ++to_dim_it) { + for (auto from_dim_it = from_size.rbegin(); from_dim_it != from_size.rend(); + ++from_dim_it) { + for (auto to_dim_it = to_size.rbegin(); to_dim_it != to_size.rend(); + ++to_dim_it) { if (*from_dim_it != 1 && *from_dim_it != *to_dim_it) { return false; } @@ -43,21 +44,22 @@ bool should_expand(const IntList &from_size, const IntList &to_size) { } void test(Type &T) { - std::vector > sizes = { {}, {0}, {1}, {1, 1}, {2}}; + std::vector> sizes = {{}, {0}, {1}, {1, 1}, {2}}; // single-tensor/size tests for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. 
auto t = ones(*s, T); - CATCH_REQUIRE((size_t)t.dim() == s->size()); - CATCH_REQUIRE((size_t)t.ndimension() == s->size()); - CATCH_REQUIRE(t.sizes().equals(*s)); - CATCH_REQUIRE(t.strides().size() == s->size()); - auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - CATCH_REQUIRE(t.numel() == numel); + ASSERT_EQ((size_t)t.dim(), s->size()); + ASSERT_EQ((size_t)t.ndimension(), s->size()); + ASSERT_TRUE(t.sizes().equals(*s)); + ASSERT_EQ(t.strides().size(), s->size()); + auto numel = + std::accumulate(s->begin(), s->end(), 1, std::multiplies()); + ASSERT_EQ(t.numel(), numel); // verify we can output std::stringstream ss; - CATCH_REQUIRE_NOTHROW(ss << t << std::endl); + ASSERT_NO_THROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +67,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + ASSERT_EQ(t.unsqueeze(0).dim(), t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - CATCH_REQUIRE(r.dim() == t.dim() + 1); + ASSERT_EQ(r.dim(), t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t.squeeze(0).dim(), std::max(t.dim() - 1, 0)); } else { - // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; - // in NumPy this is an error. - CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); + // In PyTorch, it is a no-op to try to squeeze a dimension that has size + // != 1; in NumPy this is an error. + ASSERT_EQ(t.squeeze(0).dim(), t.dim()); } // squeeze (with no dimension argument) @@ -98,12 +100,12 @@ void test(Type &T) { { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); - if (t2.dim() == 0 || t2.sizes()[0] == 1) { - CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + if (t2.dim() == 0 || t2.sizes()[0] == 1) { + ASSERT_EQ(t2.squeeze_(0).dim(), std::max(t.dim() - 1, 0)); } else { - // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; - // in NumPy this is an error. - CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); + // In PyTorch, it is a no-op to try to squeeze a dimension that has size + // != 1; in NumPy this is an error. 
+ ASSERT_EQ(t2.squeeze_(0).dim(), t.dim()); } } @@ -122,154 +124,156 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t.sum(0).dim(), std::max(t.dim() - 1, 0)); } else { - CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); + ASSERT_TRUE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(std::get<0>(ret).dim(), std::max(t.dim() - 1, 0)); + ASSERT_EQ(std::get<1>(ret).dim(), std::max(t.dim() - 1, 0)); } else { - _CATCH_REQUIRE_THROWS(t.min(0)); + ASSERT_ANY_THROW(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + ASSERT_EQ(t[0].dim(), std::max(t.dim() - 1, 0)); } else { - _CATCH_REQUIRE_THROWS(t[0]); + ASSERT_ANY_THROW(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) - TRY_CATCH_ELSE(t.fill_(t.sum(0)), - CATCH_REQUIRE(t.dim() > 1), - CATCH_REQUIRE(t.dim() <= 1)); + TRY_CATCH_ELSE( + t.fill_(t.sum(0)), ASSERT_GT(t.dim(), 1), ASSERT_LE(t.dim(), 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { for (auto rhs_it = sizes.begin(); rhs_it != sizes.end(); ++rhs_it) { // is_same_size should only match if they are the same shape { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - if(*lhs_it != *rhs_it) { - CATCH_REQUIRE(!lhs.is_same_size(rhs)); - CATCH_REQUIRE(!rhs.is_same_size(lhs)); - } - } - // forced size functions (resize_, resize_as, set_) - { - // resize_ - { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.resize_(*rhs_it); - require_equal_size_dim(lhs, rhs); - } - // resize_as_ - { - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.resize_as_(rhs); - require_equal_size_dim(lhs, rhs); - } - // set_ - { - { - // with tensor - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - lhs.set_(rhs); - require_equal_size_dim(lhs, rhs); - } - { - // with storage - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - auto storage = T.storage(rhs.numel(), false); - lhs.set_(storage); - // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - CATCH_REQUIRE(lhs.dim() != 0); - } - { - // with storage, offset, sizes, strides - auto lhs = ones(*lhs_it, T); - auto rhs = ones(*rhs_it, T); - auto storage = T.storage(rhs.numel(), false); - lhs.set_(storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); - require_equal_size_dim(lhs, rhs); - } + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + if (*lhs_it != *rhs_it) { + ASSERT_FALSE(lhs.is_same_size(rhs)); + ASSERT_FALSE(rhs.is_same_size(lhs)); } } - - // view + // forced size functions (resize_, resize_as, set_) + {// resize_ + {auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_(*rhs_it); + require_equal_size_dim(lhs, rhs); + } + // resize_as_ + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + lhs.resize_as_(rhs); + require_equal_size_dim(lhs, rhs); + } + // set_ + { { + // with tensor auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); - auto rhs_size = *rhs_it; - TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - CATCH_REQUIRE(lhs.numel() != rhs.numel()), - CATCH_REQUIRE(lhs.numel() == rhs.numel()); 
require_equal_size_dim(result, rhs);); + lhs.set_(rhs); + require_equal_size_dim(lhs, rhs); } - - // take { + // with storage auto lhs = ones(*lhs_it, T); - auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); - TRY_CATCH_ELSE(auto result = lhs.take(rhs), - CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), - require_equal_size_dim(result, rhs)); + auto rhs = ones(*rhs_it, T); + auto storage = T.storage(rhs.numel(), false); + lhs.set_(storage); + // should not be dim 0 because an empty storage is dim 1; all other + // storages aren't scalars + ASSERT_NE(lhs.dim(), 0); } - - - // ger { + // with storage, offset, sizes, strides auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); - TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - CATCH_REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), - [&]() { - int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); - int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); - require_equal_size_dim(result, at::empty({dim0, dim1}, result.options())); - }();); + auto storage = T.storage(rhs.numel(), false); + lhs.set_(storage, rhs.storage_offset(), rhs.sizes(), rhs.strides()); + require_equal_size_dim(lhs, rhs); } + } + } - // expand - { - auto lhs = ones(*lhs_it, T); - auto lhs_size = *lhs_it; - auto rhs = ones(*rhs_it, T); - auto rhs_size = *rhs_it; - bool should_pass = should_expand(lhs_size, rhs_size); - TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - CATCH_REQUIRE(!should_pass), - CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + // view + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), + ASSERT_NE(lhs.numel(), rhs.numel()), + ASSERT_EQ(lhs.numel(), rhs.numel()); + require_equal_size_dim(result, rhs);); + } - // in-place functions (would be good if we can also do a non-broadcasting one, b/c - // broadcasting functions will always end up operating on tensors of same size; - // is there an example of this outside of assign_ ?) - { - bool should_pass_inplace = should_expand(rhs_size, lhs_size); - TRY_CATCH_ELSE(lhs.add_(rhs), - CATCH_REQUIRE(!should_pass_inplace), - CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); - } - } + // take + { + auto lhs = ones(*lhs_it, T); + auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); + TRY_CATCH_ELSE(auto result = lhs.take(rhs), ASSERT_EQ(lhs.numel(), 0); + ASSERT_NE(rhs.numel(), 0), + require_equal_size_dim(result, rhs)); + } + + // ger + { + auto lhs = ones(*lhs_it, T); + auto rhs = ones(*rhs_it, T); + TRY_CATCH_ELSE(auto result = lhs.ger(rhs), + ASSERT_TRUE( + (lhs.numel() == 0 || rhs.numel() == 0 || + lhs.dim() != 1 || rhs.dim() != 1)), + [&]() { + int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); + int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); + require_equal_size_dim( + result, at::empty({dim0, dim1}, result.options())); + }();); + } + + // expand + { + auto lhs = ones(*lhs_it, T); + auto lhs_size = *lhs_it; + auto rhs = ones(*rhs_it, T); + auto rhs_size = *rhs_it; + bool should_pass = should_expand(lhs_size, rhs_size); + TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), + ASSERT_FALSE(should_pass), + ASSERT_TRUE(should_pass); + require_equal_size_dim(result, rhs);); + + // in-place functions (would be good if we can also do a non-broadcasting + // one, b/c broadcasting functions will always end up operating on tensors + // of same size; is there an example of this outside of assign_ ?) 
+ { + bool should_pass_inplace = should_expand(rhs_size, lhs_size); + TRY_CATCH_ELSE(lhs.add_(rhs), + ASSERT_FALSE(should_pass_inplace), + ASSERT_TRUE(should_pass_inplace); + require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } +} +} -CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +TEST(TestScalarTensor, TestScalarTensorCPU) { manual_seed(123, at::kCPU); - test(CPU(kFloat)); } -CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +TEST(TestScalarTensor, TestScalarTensorCUDA) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 10ffa9afc326ff..b188146f213f56 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include // define constants like M_PI and C keywords for MSVC @@ -33,26 +32,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); + ASSERT_EQ(s1.toFloat(), static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - CATCH_REQUIRE(s1.toFloat() == 100000.0); - CATCH_REQUIRE(s1.toInt() == 100000); + ASSERT_EQ(s1.toFloat(), 100000.0); + ASSERT_EQ(s1.toInt(), 100000); - CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + ASSERT_THROW(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - CATCH_REQUIRE(std::isnan(s1.toFloat())); - CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + ASSERT_TRUE(std::isnan(s1.toFloat())); + ASSERT_THROW(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - CATCH_REQUIRE(std::isinf(s1.toFloat())); - CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + ASSERT_TRUE(std::isinf(s1.toFloat())); + ASSERT_THROW(s1.toInt(), std::domain_error); } -CATCH_TEST_CASE( "scalar test", "[]" ) { - +TEST(TestScalar, TestScalar) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -60,54 +58,57 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; - Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - CATCH_REQUIRE_NOTHROW(gen.seed()); - auto && C = at::globalContext(); - if(at::hasCUDA()) { - auto t2 = zeros({4,4}, at::kCUDA); + cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " + << bar.toDouble() << " " << what.isIntegral() << "\n"; + Generator& gen = at::globalContext().defaultGenerator(at::kCPU); + ASSERT_NO_THROW(gen.seed()); + auto&& C = at::globalContext(); + if (at::hasCUDA()) { + auto t2 = zeros({4, 4}, at::kCUDA); cout << &t2 << "\n"; } - auto t = ones({4,4}); + auto t = ones({4, 4}); - auto wha2 = zeros({4,4}).add(t).sum(); - CATCH_REQUIRE( wha2.item() == 16.0 ); + auto wha2 = zeros({4, 4}).add(t).sum(); + ASSERT_EQ(wha2.item(), 16.0); - CATCH_REQUIRE( t.sizes()[0] == 4 ); - CATCH_REQUIRE( t.sizes()[1] == 4 ); - CATCH_REQUIRE( t.strides()[0] == 4 ); - CATCH_REQUIRE( t.strides()[1] == 1 ); + ASSERT_EQ(t.sizes()[0], 4); + ASSERT_EQ(t.sizes()[1], 4); + ASSERT_EQ(t.strides()[0], 4); + ASSERT_EQ(t.strides()[1], 1); - Type & T = CPU(Float); - Tensor x = randn({1,10}, T); - Tensor prev_h = randn({1,20}, T); - Tensor W_h = randn({20,20}, T); - Tensor W_x = randn({20,10}, T); + Type& T = CPU(Float); + Tensor x = randn({1, 10}, T); + Tensor prev_h = randn({1, 20}, T); + Tensor W_h = randn({20, 20}, T); + Tensor W_x = randn({20, 10}, T); Tensor i2h = at::mm(W_x, 
x.t()); Tensor h2h = at::mm(W_h, prev_h.t()); Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); + ASSERT_ANY_THROW(at::_local_scalar(Tensor{})); test_overflow(); - if(at::hasCUDA()) { + if (at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); + ASSERT_TRUE(CPU(Float).copy(r).equal(next_h)); } - CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); + ASSERT_NO_THROW(randn({10, 10, 2}, T)); // check Scalar.toTensor on Scalars backed by different data types - CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + ASSERT_EQ(scalar_to_tensor(bar).type().scalarType(), kDouble); + ASSERT_EQ(scalar_to_tensor(what).type().scalarType(), kLong); + ASSERT_EQ( + scalar_to_tensor(ones({})._local_scalar()).type().scalarType(), kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + ASSERT_NO_THROW( + ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -115,11 +116,11 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { - auto x = ones({1,2}, T); - _CATCH_REQUIRE_THROWS(x.item()); + auto x = ones({1, 2}, T); + ASSERT_ANY_THROW(x.item()); } auto float_one = ones({}, T); - CATCH_REQUIRE(float_one.item() == 1); - CATCH_REQUIRE(float_one.item() == 1); - CATCH_REQUIRE((float_one.item() == 1)); + ASSERT_EQ(float_one.item(), 1); + ASSERT_EQ(float_one.item(), 1); + ASSERT_EQ(float_one.item(), 1); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 8dc015dd1d06ae..5b6ec01a1577c4 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -11,12 +10,23 @@ #include #include +#define ASSERT_EQ_CUDA(X, Y) \ + { \ + bool isTRUE = X == Y; \ + ASSERT_TRUE(isTRUE); \ + } + +#define ASSERT_NE_CUDA(X, Y) \ + { \ + bool isFALSE = X == Y; \ + ASSERT_FALSE(isFALSE); \ + } + /* -Tests related to ATen streams. -*/ -CATCH_TEST_CASE( - "Copying and Moving Streams", - "Verifies streams are live through copying and moving") { + Tests related to ATen streams. 
+ */ +// Verifies streams are live through copying and moving +TEST(TestStream, CopyAndMoveTest) { int32_t device = -1; cudaStream_t cuda_stream; @@ -29,14 +39,14 @@ CATCH_TEST_CASE( copyStream = s; - CATCH_REQUIRE(copyStream.internals() == s.internals()); - CATCH_REQUIRE(copyStream.device() == device); - CATCH_REQUIRE(copyStream.stream() == cuda_stream); + ASSERT_EQ_CUDA(copyStream.internals(), s.internals()); + ASSERT_EQ_CUDA(copyStream.device(), device); + ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream); } - CATCH_REQUIRE(copyStream.internals()); - CATCH_REQUIRE(copyStream.device() == device); - CATCH_REQUIRE(copyStream.stream() == cuda_stream); + ASSERT_TRUE(copyStream.internals()); + ASSERT_EQ_CUDA(copyStream.device(), device); + ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,43 +57,43 @@ CATCH_TEST_CASE( moveStream = std::move(s); - CATCH_REQUIRE(moveStream.device() == device); - CATCH_REQUIRE(moveStream.stream() == cuda_stream); + ASSERT_EQ_CUDA(moveStream.device(), device); + ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream); } - CATCH_REQUIRE(moveStream.internals()); - CATCH_REQUIRE(moveStream.device() == device); - CATCH_REQUIRE(moveStream.stream() == cuda_stream); + ASSERT_TRUE(moveStream.internals()); + ASSERT_EQ_CUDA(moveStream.device(), device); + ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream); } -CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +// Verifies streams are set properly +TEST(TestStream, GetAndSetTest) { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(myStream == curStream); + ASSERT_EQ_CUDA(myStream, curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(defaultStream != myStream); - CATCH_REQUIRE(curStream == defaultStream); + ASSERT_NE_CUDA(defaultStream, myStream); + ASSERT_EQ_CUDA(curStream, defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - CATCH_REQUIRE(cur_thread_stream == new_stream); + ASSERT_EQ_CUDA(cur_thread_stream, new_stream); } -CATCH_TEST_CASE( - "Multithread Getting and Setting", - "Ensures streams are thread local") { +// Ensures streams are thread local +TEST(TestStream, MultithreadGetAndSetTest) { at::cuda::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; @@ -94,25 +104,25 @@ CATCH_TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - CATCH_REQUIRE(cur_stream == default_stream); - CATCH_REQUIRE(cur_stream != s0); - CATCH_REQUIRE(cur_stream != s1); - CATCH_REQUIRE(s0 != s1); + ASSERT_EQ_CUDA(cur_stream, default_stream); + ASSERT_NE_CUDA(cur_stream, s0); + ASSERT_NE_CUDA(cur_stream, s1); + ASSERT_NE_CUDA(s0, s1); } -CATCH_TEST_CASE("CUDAGuard") { +// CUDA Guard +TEST(TestStream, CUDAGuardTest) { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - CATCH_REQUIRE(at::cuda::current_device() == 0); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); std::vector streams0 = { - 
at::cuda::getDefaultCUDAStream(), - at::cuda::createCUDAStream()}; - CATCH_REQUIRE(streams0[0].device() == 0); - CATCH_REQUIRE(streams0[1].device() == 0); + at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; + ASSERT_EQ_CUDA(streams0[0].device(), 0); + ASSERT_EQ_CUDA(streams0[1].device(), 0); at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +131,46 @@ CATCH_TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - CATCH_REQUIRE(streams1[0].device() == 1); - CATCH_REQUIRE(streams1[1].device() == 1); + ASSERT_EQ_CUDA(streams1[0].device(), 1); + ASSERT_EQ_CUDA(streams1[1].device(), 1); at::cuda::setCurrentCUDAStream(streams1[0]); - CATCH_REQUIRE(at::cuda::current_device() == 0); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - CATCH_REQUIRE(guard.original_streams().empty()); + ASSERT_TRUE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - CATCH_REQUIRE( - guard.original_streams().size() == at::cuda::getNumGPUs()); - CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); - CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); + ASSERT_EQ_CUDA(guard.original_streams().size(), at::cuda::getNumGPUs()); + ASSERT_EQ_CUDA(guard.original_streams()[0], streams0[0]); + ASSERT_EQ_CUDA(guard.original_streams()[1], streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - CATCH_REQUIRE(guard.last_device() == 1); - CATCH_REQUIRE(at::cuda::current_device() == 1); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + ASSERT_EQ_CUDA(guard.last_device(), 1); + ASSERT_EQ_CUDA(at::cuda::current_device(), 1); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[1]); } // Device and stream are now reset - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - CATCH_REQUIRE(guard.last_device() == 1); - CATCH_REQUIRE(at::cuda::current_device() == 1); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(guard.last_device(), 1); + ASSERT_EQ_CUDA(at::cuda::current_device(), 1); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); } - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
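ASSERT_EQ_CUDA and ASSERT_NE_CUDA, defined near the top of this file, first collapse the comparison into a plain bool and only then hand that bool to ASSERT_TRUE / ASSERT_FALSE, so GoogleTest never has to compare or pretty-print CUDA-side types such as at::cuda::CUDAStream itself. A stripped-down sketch of the same idiom; FakeStream and ASSERT_EQ_SKETCH are hypothetical names used only for illustration.

#include "gtest/gtest.h"

// Hypothetical handle type standing in for at::cuda::CUDAStream; it is
// equality-comparable but has no operator<< for printing.
struct FakeStream {
  int id;
};

inline bool operator==(const FakeStream& a, const FakeStream& b) {
  return a.id == b.id;
}

// Same shape as ASSERT_EQ_CUDA above: evaluate the comparison into a bool
// first, then assert on that bool.
#define ASSERT_EQ_SKETCH(X, Y)  \
  {                             \
    bool is_equal = (X) == (Y); \
    ASSERT_TRUE(is_equal);      \
  }

TEST(SketchSuite, HandlesCompareEqual) {
  FakeStream a{1};
  FakeStream b{1};
  ASSERT_EQ_SKETCH(a, b);
}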
@@ -171,12 +180,13 @@ CATCH_TEST_CASE("CUDAGuard") { guard.set_device(1); } - CATCH_REQUIRE(at::cuda::current_device() == 0); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + ASSERT_EQ_CUDA(at::cuda::current_device(), 0); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]); + ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]); } -CATCH_TEST_CASE("CUDAGuardIsMovable") { +// CUDAGuardIsMovable +TEST(TestStream, CUDAGuardMovableTest) { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +195,18 @@ CATCH_TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - CATCH_REQUIRE(second.original_streams().size() == device_count); - CATCH_REQUIRE(second.original_device() == 0); - CATCH_REQUIRE(second.last_device() == 1); + ASSERT_EQ_CUDA(second.original_streams().size(), device_count); + ASSERT_EQ_CUDA(second.original_device(), 0); + ASSERT_EQ_CUDA(second.last_device(), 1); at::cuda::CUDAGuard third; third = std::move(second); - CATCH_REQUIRE(third.original_streams().size() == device_count); - CATCH_REQUIRE(third.original_device() == 0); - CATCH_REQUIRE(third.last_device() == 1); + ASSERT_EQ_CUDA(third.original_streams().size(), device_count); + ASSERT_EQ_CUDA(third.original_device(), 0); + ASSERT_EQ_CUDA(third.last_device(), 1); } -CATCH_TEST_CASE("Streampool Round Robin") { +// Streampool Round Robin +TEST(TestStream, StreamPoolTest) { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -206,14 +217,17 @@ CATCH_TEST_CASE("Streampool Round Robin") { for (auto i = decltype(streams.size()){0}; i < streams.size(); ++i) { cudaStream_t cuda_stream = streams[i]; auto result_pair = stream_set.insert(cuda_stream); - if (!result_pair.second) hasDuplicates = true; + if (!result_pair.second) + hasDuplicates = true; } - CATCH_REQUIRE(hasDuplicates); + ASSERT_TRUE(hasDuplicates); } -CATCH_TEST_CASE("Multi-GPU") { - if (at::cuda::getNumGPUs() < 2) return; +// Multi-GPU +TEST(TestStream, MultiGPUTest) { + if (at::cuda::getNumGPUs() < 2) + return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); at::cuda::CUDAStream s1 = at::cuda::createCUDAStream(false, 1); @@ -221,17 +235,18 @@ CATCH_TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + ASSERT_EQ_CUDA(s0, at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + ASSERT_EQ_CUDA(s1, at::cuda::getCurrentCUDAStream()); } -CATCH_TEST_CASE("CUDAEvent Syncs") { +// CUDAEvent Syncs +TEST(TestStream, CUDAEventSyncTest) { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - CATCH_REQUIRE(!event.happened()); + ASSERT_FALSE(event.happened()); event.recordOnce(stream); @@ -242,11 +257,13 @@ CATCH_TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - CATCH_REQUIRE(event.happened()); + ASSERT_TRUE(event.happened()); } -CATCH_TEST_CASE("Cross-Device Events") { - if (at::cuda::getNumGPUs() < 2) return; +// Cross-Device Events +TEST(TestStream, CrossDeviceTest) { + if (at::cuda::getNumGPUs() < 2) + return; const auto stream0 = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event0; @@ -257,13 +274,13 @@ CATCH_TEST_CASE("Cross-Device Events") { 
event0.record(stream0); event1.record(stream1); - + event0 = std::move(event1); - - CATCH_REQUIRE(event0.device() == 1); + + ASSERT_EQ_CUDA(event0.device(), 1); stream0.synchronize_with(event0); - + cudaStreamSynchronize(stream0); - CATCH_REQUIRE(event0.happened()); + ASSERT_TRUE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 81701733b53693..99421ca225a361 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,12 +10,11 @@ using namespace at; -CATCH_TEST_CASE( "parallel", "[cpu]" ) { - +TEST(TestParallel, TestParallel) { manual_seed(123, at::kCPU); set_num_threads(1); - Tensor a = rand({1,3}); + Tensor a = rand({1, 3}); a[0][0] = 1; a[0][1] = 0; a[0][2] = 0; @@ -24,5 +22,5 @@ CATCH_TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - CATCH_REQUIRE(a.sum(0).equal(as)); + ASSERT_TRUE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index c01dff2d0038b1..8518c4f4358365 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "ATen/core/UndefinedTensorImpl.h" @@ -8,7 +7,7 @@ using namespace at; -CATCH_TEST_CASE( "undefined tensor test", "[]" ) { +TEST(TestUndefined, UndefinedTest) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +16,36 @@ CATCH_TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - CATCH_REQUIRE(!und.defined()); - CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); - - _CATCH_REQUIRE_THROWS(und.strides()); - _CATCH_REQUIRE_THROWS(und.dim()); - _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - _CATCH_REQUIRE_THROWS(und.add(und)); - _CATCH_REQUIRE_THROWS(und.add(ft)); - _CATCH_REQUIRE_THROWS(ft.add(und)); - _CATCH_REQUIRE_THROWS(und.add(5)); - _CATCH_REQUIRE_THROWS(und.mm(und)); + ASSERT_FALSE(und.defined()); + ASSERT_EQ(std::string("UndefinedType"), und.toString()); + + ASSERT_ANY_THROW(und.strides()); + ASSERT_ANY_THROW(und.dim()); + ASSERT_ANY_THROW([]() { return Tensor(); }() = Scalar(5)); + ASSERT_ANY_THROW(und.add(und)); + ASSERT_ANY_THROW(und.add(ft)); + ASSERT_ANY_THROW(ft.add(und)); + ASSERT_ANY_THROW(und.add(5)); + ASSERT_ANY_THROW(und.mm(und)); und.toType(und.type()); - _CATCH_REQUIRE_THROWS(und.toType(ft.type())); - _CATCH_REQUIRE_THROWS(ft.toType(und.type())); + ASSERT_ANY_THROW(und.toType(ft.type())); + ASSERT_ANY_THROW(ft.toType(und.type())); und.toType(ScalarType::Undefined); - _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); - _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + ASSERT_ANY_THROW(und.toType(ScalarType::Float)); + ASSERT_ANY_THROW(ft.toType(ScalarType::Undefined)); // copy_ - _CATCH_REQUIRE_THROWS(und.copy_(und)); - _CATCH_REQUIRE_THROWS(und.copy_(ft)); - _CATCH_REQUIRE_THROWS(ft.copy_(und)); + ASSERT_ANY_THROW(und.copy_(und)); + ASSERT_ANY_THROW(und.copy_(ft)); + ASSERT_ANY_THROW(ft.copy_(und)); und.toBackend(Backend::Undefined); - _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); - _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + 
ASSERT_ANY_THROW(und.toBackend(Backend::CPU)); + ASSERT_ANY_THROW(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - CATCH_REQUIRE(!to_move.defined()); - CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); + ASSERT_FALSE(to_move.defined()); + ASSERT_EQ(to_move.unsafeGetTensorImpl(), UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 42c9f61b19b5e1..3539db77d65517 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" @@ -10,53 +9,55 @@ using at::Tensor; using at::WeakTensor; -CATCH_TEST_CASE( "Weak pointer tests", "" ) { - CATCH_SECTION("gets invalidated") { - Tensor a = at::ones({2, 2}); +// Weak pointer tests +// gets invalidated +TEST(TestWeakPointer, WeakPointerGetsInvalidated) { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + a.reset(); + ASSERT_FALSE(b.lock().defined()); +} + +// can successfully lock +TEST(TestWeakPointer, WeakPointerLock) { + Tensor a = at::ones({2, 2}); + WeakTensor b = a; + auto c = b.lock(); + ASSERT_TRUE(c.defined()); + + a.reset(); + ASSERT_TRUE(b.lock().defined()); + c.reset(); + ASSERT_FALSE(b.lock().defined()); +} + +// updates refcounts correctly +TEST(TestWeakPointer, WeakUpdatesRefcountsTest) { + Tensor a = at::ones({2, 2}); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { WeakTensor b = a; - a.reset(); - CATCH_REQUIRE_FALSE(b.lock().defined()); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 2); } - - CATCH_SECTION("can successfully lock") { - Tensor a = at::ones({2, 2}); + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { WeakTensor b = a; - auto c = b.lock(); - CATCH_REQUIRE(c.defined()); - - a.reset(); - CATCH_REQUIRE(b.lock().defined()); - c.reset(); - CATCH_REQUIRE_FALSE(b.lock().defined()); + ASSERT_EQ(a.use_count(), 1); + auto locked = b.lock(); + ASSERT_TRUE(locked.defined()); + ASSERT_EQ(a.use_count(), 2); } - - CATCH_SECTION("updates refcounts correctly") { - Tensor a = at::ones({2, 2}); - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 2); - } - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - auto locked = b.lock(); - CATCH_REQUIRE(locked.defined()); - CATCH_REQUIRE(a.use_count() == 2); - } - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 1); - { - WeakTensor b = a; - CATCH_REQUIRE(a.use_count() == 1); - CATCH_REQUIRE(a.weak_use_count() == 2); - a.reset(); - CATCH_REQUIRE(b.use_count() == 0); - CATCH_REQUIRE(b.weak_use_count() == 1); - } + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 1); + { + WeakTensor b = a; + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(a.weak_use_count(), 2); + a.reset(); + ASSERT_EQ(b.use_count(), 0); + ASSERT_EQ(b.weak_use_count(), 1); } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index f76dac212a0921..f08071424625b3 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,45 @@ -#define CATCH_CONFIG_MAIN -#include "catch_utils.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include 
"test_seed.h" using namespace at; +void TestSimpleCase(Type& T) { + auto a = randn({2, 3, 4, 5}, T); + ASSERT_TRUE(a.prod(-4).equal(a.prod(0))); + ASSERT_TRUE(a.prod(3).equal(a.prod(-1))); +} + +void TestExpressionSpecification(Type& T) { + auto a = randn({2, 3, 4, 5}, T); + ASSERT_TRUE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + ASSERT_TRUE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + + // can unsqueeze scalar + auto b = randn(1, T); + b.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_TRUE(b.unsqueeze(0).equal(b.unsqueeze(-1))); +} + +void TestEmptyTensor(Type& T) { + auto a = randn(0, T); + ASSERT_TRUE(a.prod(0).equal(at::ones({}, T))); +} + +void TestScalarVs1Dim1Size(Type& T) { + auto a = randn(1, T); + ASSERT_TRUE(a.prod(0).equal(a.prod(-1))); + a.unsafeGetTensorImpl()->maybe_zero_dim(true); + ASSERT_EQ(a.dim(), 0); + ASSERT_TRUE(a.prod(0).equal(a.prod(-1))); +} -CATCH_TEST_CASE( "wrapdim test", "[]" ) { +TEST(TestWrapdim, TestWrapdim) { manual_seed(123, at::kCPU); + Type& T = CPU(kFloat); - Type & T = CPU(kFloat); - - CATCH_SECTION( "simple case" ) { - auto a = randn({2, 3, 4, 5}, T); - CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); - CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); - } - - CATCH_SECTION( "expression specification" ) { - auto a = randn({2, 3, 4, 5}, T); - CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); - - // can unsqueeze scalar - auto b = randn(1, T); - b.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); - } - - CATCH_SECTION( "empty tensor" ) { - auto a = randn(0, T); - CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); - } - - CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { - auto a = randn(1, T); - CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); - a.unsafeGetTensorImpl()->maybe_zero_dim(true); - CATCH_REQUIRE(a.dim() == 0); - CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); - } + TestSimpleCase(T); + TestEmptyTensor(T); + TestScalarVs1Dim1Size(T); + TestExpressionSpecification(T); } diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 756fa0f905ac13..8752b0df458cfd 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -96,19 +96,24 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { } static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - unsigned int * address_as_ui = - (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; + #if ((CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + do { + assumed = old; + at::Half hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = THCNumerics::add(hsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + #else + atomicAdd(reinterpret_cast<__half*>(address), val); + #endif - do { - assumed = old; - at::Half hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } while (assumed != old); } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 157a324f6e45b8..27ec95adbaa82e 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -1,6 +1,7 @@ #ifndef THC_NUMERICS_INC #define THC_NUMERICS_INC +#include #include #include #include diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index f481a6292c7f56..ecbae477282c15 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -81,8 +81,8 @@ void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) { : backend == "cuda" ? "CUDA" : backend == "dnnlowp" ? "DNNLOWP" - : backend == "dnnlowp_16" - ? "DNNLOWP_16" + : backend == "dnnlowp_acc16" + ? "DNNLOWP_ACC16" : backend == "default" ? "" : "NONE"; CAFFE_ENFORCE(engine != "NONE", "Backend is not supported"); for (int i = 0; i < net_def->op_size(); i++) { diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 4b7bab4f42eeb9..490baa56a8acee 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -16,6 +16,9 @@ configure_file( # check with the core PyTorch developers as the dependendency will be # transitively passed on to all libraries dependent on PyTorch. file(GLOB_RECURSE C10_SRCS *.cpp) +# exclude test files +file(GLOB_RECURSE C10_ALL_TEST_FILES test/*.cpp) +exclude(C10_SRCS "${C10_SRCS}" ${C10_ALL_TEST_FILES}) file(GLOB_RECURSE C10_HEADERS *.h) add_library(c10 ${C10_SRCS} ${C10_HEADERS}) # If building shared library, set dllimport/dllexport proper. @@ -31,6 +34,8 @@ target_include_directories( $ $) +add_subdirectory(test) + # ---[ Installation # Note: for now, we will put all export path into one single Caffe2Targets group # to deal with the cmake deployment need. Inside the Caffe2Targets set, the diff --git a/c10/c10_dummy.cpp b/c10/c10_dummy.cpp deleted file mode 100644 index df4e73171da3ff..00000000000000 --- a/c10/c10_dummy.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "c10/c10_dummy.h" - -namespace c10 { -bool HasC10() { - return true; -} -} // namespace c10 diff --git a/c10/c10_dummy.h b/c10/c10_dummy.h deleted file mode 100644 index cf6c6b30c14bbf..00000000000000 --- a/c10/c10_dummy.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include "c10/macros/Macros.h" - -namespace c10 { -C10_API bool HasC10(); -} diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 8e593e0100bbf9..4527150c8f6803 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -1,3 +1,6 @@ +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + /* Header file to define the common scaffolding for exported symbols. * * Export is by itself a quite tricky situation to deal with, and if you are @@ -9,8 +12,6 @@ * Do NOT include this file directly. Instead, use c10/macros/Macros.h */ -#pragma once - // You do not need to edit this part of file unless you are changing the core // pytorch export abstractions. // @@ -74,3 +75,5 @@ #else #define CAFFE2_API C10_IMPORT #endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/c10/macros/Legacy.h b/c10/macros/Legacy.h deleted file mode 100644 index 86752a838acd32..00000000000000 --- a/c10/macros/Legacy.h +++ /dev/null @@ -1,7 +0,0 @@ -/* A centralized location to provide legacy macro support, and a warning about - * when this legacy compatibility symbol is going to removed in the future. - * - * Do NOT include this file directly. 
Instead, use c10/macros/Macros.h - */ - -#pragma once diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 2b438d670f00de..ad9fafd4ab8f55 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -1,11 +1,12 @@ +#ifndef C10_MACROS_MACROS_H_ +#define C10_MACROS_MACROS_H_ + /* Main entry for c10/macros. * * In your code, include c10/macros/Macros.h directly, instead of individual * files in this folder. */ -#pragma once - // For build systems that do not directly depend on CMake and directly build // from the source directory (such as Buck), one may not have a cmake_macros.h // file at all. In this case, the build system is responsible for providing @@ -28,5 +29,4 @@ classname(const classname&) = delete; \ classname& operator=(const classname&) = delete -// Finally, file that provides legacy support for macros -#include "c10/macros/Legacy.h" +#endif // C10_MACROS_MACROS_H_ diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in index 73bc803f063551..c211c54bdd7af6 100644 --- a/c10/macros/cmake_macros.h.in +++ b/c10/macros/cmake_macros.h.in @@ -1,6 +1,9 @@ +#ifndef C10_MACROS_CMAKE_MACROS_H_ +#define C10_MACROS_CMAKE_MACROS_H_ + // Automatically generated header file for the C10 library. // Do not include this file directly. Instead, include c10/macros/Macros.h. -#pragma once - #cmakedefine C10_BUILD_SHARED_LIBS + +#endif // C10_MACROS_CMAKE_MACROS_H_ diff --git a/c10/test/CMakeLists.txt b/c10/test/CMakeLists.txt new file mode 100644 index 00000000000000..a2a29f59eb5bd8 --- /dev/null +++ b/c10/test/CMakeLists.txt @@ -0,0 +1,15 @@ +# ---[ Test binaries. + +file(GLOB C10_ALL_TEST_FILES *.cpp) +if (BUILD_TEST) + foreach(test_src ${C10_ALL_TEST_FILES}) + get_filename_component(test_file_name ${test_src} NAME_WE) + set(test_name "c10_${test_file_name}") + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} c10 gtest_main) + add_test(NAME ${test_name} COMMAND $) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() + endforeach() +endif() diff --git a/c10/test/registry_test.cpp b/c10/test/registry_test.cpp new file mode 100644 index 00000000000000..c6e7f620e602b5 --- /dev/null +++ b/c10/test/registry_test.cpp @@ -0,0 +1,49 @@ +#include +#include +#include + +#include "c10/util/Registry.h" + +// Note: we use a different namespace to test if the macros defined in +// Registry.h actuall works with a different namespace from c10. 
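The c10/test/CMakeLists.txt added above builds every `*.cpp` under `c10/test/` as its own `c10_<name>` gtest binary linked against `gtest_main`, so a test file needs no `main()`. A minimal sketch of such a file, not part of this diff (file name, test name, and the macro it checks are illustrative only):

```
#include <gtest/gtest.h>

#include "c10/macros/Macros.h"

// Hypothetical c10/test/macros_smoke_test.cpp: the glob above picks it up and
// builds it as the `c10_macros_smoke_test` executable; gtest_main supplies main().
TEST(MacrosSmokeTest, CmakeMacrosHeaderIsVisible) {
#ifdef C10_BUILD_SHARED_LIBS
  SUCCEED() << "c10 was configured as a shared library";
#else
  SUCCEED() << "c10 was configured as a static library";
#endif
}
```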
+namespace c10_test { + +class Foo { + public: + explicit Foo(int x) { + // LOG(INFO) << "Foo " << x; + } +}; + +C10_DECLARE_REGISTRY(FooRegistry, Foo, int); +C10_DEFINE_REGISTRY(FooRegistry, Foo, int); +#define REGISTER_FOO(clsname) C10_REGISTER_CLASS(FooRegistry, clsname, clsname) + +class Bar : public Foo { + public: + explicit Bar(int x) : Foo(x) { + // LOG(INFO) << "Bar " << x; + } +}; +REGISTER_FOO(Bar); + +class AnotherBar : public Foo { + public: + explicit AnotherBar(int x) : Foo(x) { + // LOG(INFO) << "AnotherBar " << x; + } +}; +REGISTER_FOO(AnotherBar); + +TEST(RegistryTest, CanRunCreator) { + std::unique_ptr bar(FooRegistry()->Create("Bar", 1)); + EXPECT_TRUE(bar != nullptr) << "Cannot create bar."; + std::unique_ptr another_bar(FooRegistry()->Create("AnotherBar", 1)); + EXPECT_TRUE(another_bar != nullptr); +} + +TEST(RegistryTest, ReturnNullOnNonExistingCreator) { + EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr); +} + +} // namespace c10_test diff --git a/c10/util/Registry.h b/c10/util/Registry.h new file mode 100644 index 00000000000000..9f310c73483263 --- /dev/null +++ b/c10/util/Registry.h @@ -0,0 +1,226 @@ +#ifndef C10_UTIL_REGISTRY_H_ +#define C10_UTIL_REGISTRY_H_ + +/** + * Simple registry implementation that uses static variables to + * register object creators during program initialization time. + */ + +// NB: This Registry works poorly when you have other namespaces. +// Make all macro invocations from inside the at namespace. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "c10/util/Type.h" + +namespace c10 { + +template +inline void PrintOffendingKey(const KeyType& /*key*/) { + printf("[key type printing not supported]\n"); +} + +template <> +inline void PrintOffendingKey(const std::string& key) { + printf("Offending key: %s.\n", key.c_str()); +} + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything + * that can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class Registry { + public: + typedef std::function Creator; + + Registry() : registry_() {} + + void Register(const SrcType& key, Creator creator) { + std::lock_guard lock(register_mutex_); + // The if statement below is essentially the same as the following line: + // CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, CHECK_EQ depends on google logging, and since registration is + // carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. + if (registry_.count(key) != 0) { + printf("Key already registered.\n"); + PrintOffendingKey(key); + std::exit(1); + } + registry_[key] = creator; + } + + void Register( + const SrcType& key, + Creator creator, + const std::string& help_msg) { + Register(key, creator); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { + return (registry_.count(key) != 0); + } + + ObjectPtrType Create(const SrcType& key, Args... args) { + if (registry_.count(key) == 0) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return registry_[key](args...); + } + + /** + * Returns the keys currently registered as a std::vector. 
+ */ + std::vector Keys() const { + std::vector keys; + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + inline const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + private: + std::unordered_map registry_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + C10_DISABLE_COPY_AND_ASSIGN(Registry); +}; + +template +class Registerer { + public: + Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif + +/** + * C10_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. + */ +// Note on C10_IMPORT and C10_EXPORT below: we need to explicitly mark DECLARE +// as import and DEFINE as export, because these registry macros will be used +// in downstream shared libraries as well, and one cannot use *_API - the API +// macro will be defined on a per-shared-library basis. Semantically, when one +// declares a typed registry it is always going to be IMPORT, and when one +// defines a registry (which should happen ONLY ONCE and ONLY IN SOURCE FILE), +// the instantiation unit is always going to be exported. +// +// The only unique condition is when in the same file one does DECLARE and +// DEFINE - in Windows compilers, this generates a warning that dllimport and +// dllexport are mixed, but the warning is fine and linker will be properly +// exporting the symbol. Same thing happens in the gflags flag declaration and +// definition caes. +#define C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_IMPORT ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName(); \ + typedef ::c10::Registerer, ##__VA_ARGS__> \ + Registerer##RegistryName + +#define C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_EXPORT ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName() { \ + static ::c10::Registry, ##__VA_ARGS__>* \ + registry = new ::c10:: \ + Registry, ##__VA_ARGS__>(); \ + return registry; \ + } + +// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated +// creator with comma in its templated arguments. +#define C10_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, RegistryName(), ##__VA_ARGS__); + +#define C10_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::c10::demangle_type<__VA_ARGS__>()); + +// C10_DECLARE_REGISTRY and C10_DEFINE_REGISTRY are hard-wired to use +// std::string as the key type, because that is the most commonly used cases. +#define C10_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +// C10_REGISTER_CREATOR and C10_REGISTER_CLASS are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define C10_REGISTER_CREATOR(RegistryName, key, ...) \ + C10_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) + +#define C10_REGISTER_CLASS(RegistryName, key, ...) \ + C10_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +} // namespace c10 + +#endif // C10_UTIL_REGISTRY_H_ diff --git a/c10/util/Type.cpp b/c10/util/Type.cpp new file mode 100644 index 00000000000000..3e00055c699104 --- /dev/null +++ b/c10/util/Type.cpp @@ -0,0 +1,59 @@ +#include "c10/util/Type.h" + +#include +#include +#include + +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) +#define HAS_DEMANGLE 0 +#elif defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) +#define HAS_DEMANGLE 0 +#else +#define HAS_DEMANGLE 1 +#endif + +#if HAS_DEMANGLE + +#include +#include + +namespace c10 { + +std::string demangle(const char* name) { + int status = -1; + + // This function will demangle the mangled function name into a more human + // readable format, e.g. _Z1gv -> g(). + // More information: + // https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/libsupc%2B%2B/cxxabi.h + // NOTE: `__cxa_demangle` returns a malloc'd string that we have to free + // ourselves. + std::unique_ptr> demangled( + abi::__cxa_demangle( + name, + /*__output_buffer=*/nullptr, + /*__length=*/0, + &status), + /*deleter=*/free); + + // Demangling may fail, for example when the name does not follow the + // standard C++ (Itanium ABI) mangling scheme. This is the case for `main` + // or `clone` for example, so the mangled name is a fine default. + if (status == 0) { + return demangled.get(); + } else { + return name; + } +} + +} // namespace c10 + +#else // HAS_DEMANGLE +namespace c10 { +std::string demangle(const char* name) { + return std::string(name); +} +} // namespace c10 + +#endif // HAS_DEMANGLE diff --git a/c10/util/Type.h b/c10/util/Type.h new file mode 100644 index 00000000000000..ddaa0c258753a7 --- /dev/null +++ b/c10/util/Type.h @@ -0,0 +1,28 @@ +#ifndef C10_UTIL_TYPE_H_ +#define C10_UTIL_TYPE_H_ + +#include +#include +#include + +#include "c10/macros/Macros.h" + +namespace c10 { + +/// Utility to demangle a C++ symbol name. +C10_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +} // namespace c10 + +#endif // C10_UTIL_TYPE_H_ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 885ca028fb2464..07f69d9f7bab98 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -400,9 +400,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) @@ -416,9 +413,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) @@ -434,9 +428,6 @@ if (BUILD_TEST) target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.0) - target_compile_features(${test_name} PRIVATE cxx_range_for) - endif() add_test(NAME ${test_name} COMMAND $) if (INSTALL_TEST) install(TARGETS ${test_name} DESTINATION test) diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 96bc720ccd59d1..aa41595ae06b66 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_CORE_ALLOCATOR_H_ #define CAFFE2_CORE_ALLOCATOR_H_ +#include #include #include "caffe2/core/logging.h" diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index e09a54cbd2df56..06f278aac2ae86 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -21,13 +21,13 @@ inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { return false; } const Tensor* tensor = &blob.Get(); - return tensor && tensor->GetDeviceType() == device_type; + return tensor && *tensor && tensor->GetDeviceType() == device_type; } inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) { if (blob->IsType()) { Tensor* tensor = blob->GetMutable(); - if (tensor->GetDeviceType() == device_type) { + if (*tensor && tensor->GetDeviceType() == device_type) { return tensor; } } diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index d4ef19db69ce4f..8126b3d59425a1 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -196,7 +196,7 @@ void TensorSerializer::Serialize( const TensorProto::DataType data_type = TypeMetaToDataType(input.meta()); proto.set_data_type(data_type); StoreDeviceDetail(input, &proto); - auto uniq_ptr = input.GetStaticContext()->CreateContext(); + auto uniq_ptr = CreateContext(input.GetDevice()); // A lot of copypaste is error prone. Should we create a macro for this? 
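The `c10::demangle`/`c10::demangle_type` helpers introduced in c10/util/Type.h above can be exercised on their own. A minimal sketch, not part of this diff, assuming RTTI is enabled (the demangled spelling is compiler-dependent; without `__GXX_RTTI`, `demangle_type()` returns a placeholder string):

```
#include <cstdio>
#include <typeinfo>

#include "c10/util/Type.h"

namespace demo {
struct Widget {};  // hypothetical type, used only to have something to demangle
}  // namespace demo

int main() {
  // demangle_type<T>() caches the demangled name and returns a C string,
  // e.g. "demo::Widget" with GCC/Clang.
  std::printf("%s\n", c10::demangle_type<demo::Widget>());
  // demangle() accepts any mangled name, e.g. straight from typeid().
  std::printf("%s\n", c10::demangle(typeid(demo::Widget).name()).c_str());
  return 0;
}
```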
switch (data_type) { case TensorProto_DataType_FLOAT: @@ -322,13 +322,13 @@ void TensorSerializer::StoreDeviceDetail( input.ExtractDeviceOption(proto->mutable_device_detail()); } // The actual serialization registry objects. -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr); -CAFFE_DEFINE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); +C10_DEFINE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); void DeserializeBlob(const string& content, Blob* result) { BlobProto blob_proto; @@ -371,8 +371,7 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { void TensorDeserializer::Deserialize(const TensorProto& proto, Tensor* tensor) { // We create a local context for deserializing. Since Caffe2 contexts are // usually lightweight, this should not involve too much overhead. - auto uniq_ptr = - tensor->GetStaticContext()->CreateContext(proto.device_detail()); + auto uniq_ptr = CreateContext(OptionToDevice(proto.device_detail())); auto context = uniq_ptr.get(); context->SwitchToDevice(0); vector dims; diff --git a/caffe2/core/blob_serializer_base.h b/caffe2/core/blob_serializer_base.h index b51f3da21a30f4..4e0e3e4d6d18fe 100644 --- a/caffe2/core/blob_serializer_base.h +++ b/caffe2/core/blob_serializer_base.h @@ -3,8 +3,8 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { @@ -57,13 +57,13 @@ class BlobSerializerBase { }; // The Blob serialization registry and serializer creator functions. -CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobSerializerRegistry, TypeIdentifier, BlobSerializerBase, std::unique_ptr); #define REGISTER_BLOB_SERIALIZER(id, ...) \ - CAFFE_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobSerializerRegistry, id, __VA_ARGS__) // Creates an operator with the given operator definition. inline unique_ptr CreateSerializer(TypeIdentifier id) { return BlobSerializerRegistry()->Create(id); @@ -82,9 +82,9 @@ class CAFFE2_API BlobDeserializerBase { virtual void Deserialize(const BlobProto& proto, Blob* blob) = 0; }; -CAFFE_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); +C10_DECLARE_REGISTRY(BlobDeserializerRegistry, BlobDeserializerBase); #define REGISTER_BLOB_DESERIALIZER(name, ...) \ - CAFFE_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(BlobDeserializerRegistry, name, __VA_ARGS__) // Creates an operator with the given operator definition. 
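The serializer registries above show the mechanical part of the rename; downstream registries migrate the same way, keeping their arguments and `std::string` keys. A hedged sketch, not part of this diff, using a purely hypothetical registry:

```
#include <memory>
#include <string>

#include "c10/util/Registry.h"

// Hypothetical base class and implementation, for illustration only.
struct Codec {
  virtual ~Codec() {}
};
struct GzipCodec : Codec {};

// CAFFE_DECLARE_REGISTRY / CAFFE_DEFINE_REGISTRY / CAFFE_REGISTER_CLASS become
// their C10_* counterparts with identical arguments.
C10_DECLARE_REGISTRY(CodecRegistry, Codec);  // normally in a header
C10_DEFINE_REGISTRY(CodecRegistry, Codec);   // in exactly one .cc file
C10_REGISTER_CLASS(CodecRegistry, GzipCodec, GzipCodec);

std::unique_ptr<Codec> MakeGzipCodec() {
  // Create() returns nullptr for unknown keys instead of throwing.
  return CodecRegistry()->Create("GzipCodec");
}
```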
inline unique_ptr CreateDeserializer(const string& type) { return BlobDeserializerRegistry()->Create(type); diff --git a/caffe2/core/blob_stats.h b/caffe2/core/blob_stats.h index 67f9e88e2edc62..5c9f80f518f91c 100644 --- a/caffe2/core/blob_stats.h +++ b/caffe2/core/blob_stats.h @@ -1,7 +1,7 @@ #pragma once +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" -#include "caffe2/core/registry.h" #include "caffe2/core/typeid.h" #include @@ -33,7 +33,7 @@ struct BlobStatRegistry { #define REGISTER_BLOB_STAT_GETTER(Type, BlobStatGetterClass) \ static BlobStatRegistry::Registrar \ - CAFFE_ANONYMOUS_VARIABLE(BlobStatRegistry) + C10_ANONYMOUS_VARIABLE(BlobStatRegistry) namespace BlobStat { diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index bb2f4ba6a91818..d856655433aa3e 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -3,6 +3,7 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/blob_serialization.h" #include "caffe2/core/common.h" @@ -11,7 +12,6 @@ #include "caffe2/core/operator.h" #include "caffe2/core/qtensor.h" #include "caffe2/core/qtensor_serialization.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/core/workspace.h" @@ -967,7 +967,7 @@ CAFFE_KNOWN_TYPE(DummyType); namespace { REGISTER_BLOB_SERIALIZER((TypeMeta::Id()), DummyTypeSerializer); -CAFFE_REGISTER_TYPED_CLASS( +C10_REGISTER_TYPED_CLASS( BlobDeserializerRegistry, "DummyType", DummyTypeDeserializer); diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 93bbf341b5061a..d1803a6a2d2812 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -24,16 +24,13 @@ // Macros used during the build of this caffe2 instance. This header file // is automatically generated by the cmake script during build. +#include "caffe2/core/common.h" #include "caffe2/core/macros.h" #include "c10/macros/Macros.h" namespace caffe2 { -// Data type for caffe2 Index/Size. We use size_t to be safe here as well as for -// large matrices that are common in sparse math. -typedef int64_t TIndex; - // Note(Yangqing): NVCC does not play well with unordered_map on some platforms, // forcing us to use std::map instead of unordered_map. 
This may affect speed // in some cases, but in most of the computation code we do not access map very diff --git a/caffe2/core/common_gpu.cc b/caffe2/core/common_gpu.cc index 9e39a85721186f..e2794bbd39d92f 100644 --- a/caffe2/core/common_gpu.cc +++ b/caffe2/core/common_gpu.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include "caffe2/core/asan.h" diff --git a/caffe2/core/context.cc b/caffe2/core/context.cc index 30819afdc4ce3f..94047eb71ee0b6 100644 --- a/caffe2/core/context.cc +++ b/caffe2/core/context.cc @@ -5,6 +5,10 @@ #include #endif +namespace at { + +REGISTER_CONTEXT(DeviceType::CPU, caffe2::CPUContext); +} // namespace at namespace caffe2 { uint32_t RandomNumberSeed() { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index aff66534d22198..af66396af72c44 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -50,6 +50,8 @@ class CAFFE2_API CPUContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_CPU); } + explicit CPUContext(const at::Device& device) + : CPUContext(DeviceToOption(device)) {} ~CPUContext() noexcept override {} @@ -192,15 +194,6 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { return data_and_deleter; } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return CPU; } diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc index b61b73cbad1cb5..99996d9e165b9b 100644 --- a/caffe2/core/context_base.cc +++ b/caffe2/core/context_base.cc @@ -1,4 +1,5 @@ #include "context_base.h" namespace caffe2 { + } // namespace caffe2 diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 1eaa579ee0cdbe..0d9e2686212a1e 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -57,6 +57,11 @@ CAFFE2_DEFINE_int( 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext); +} // namespace at + namespace caffe2 { ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() { diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 5fcdb98b100794..ce73f5f942828b 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -142,6 +142,8 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { // The default cuda context constructor. 
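With `REGISTER_CONTEXT` and the `at::Device`-based constructors added above, a context can be built straight from a device instead of asking the static context, which is what the serialization code now does via `CreateContext(tensor.GetDevice())`. A minimal CPU-side sketch, not part of this diff (the helper name is hypothetical):

```
#include "caffe2/core/context.h"

// Hypothetical helper exercising the new explicit CPUContext(const at::Device&)
// constructor.
void TouchCpuContext() {
  caffe2::CPUContext ctx(at::Device(at::DeviceType::CPU));
  ctx.SwitchToDevice(0);           // trivial for CPU, kept for API symmetry
  ctx.FinishDeviceComputation();   // likewise a no-op on the CPU backend
}
```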
explicit CUDAContext(const int gpu_id = -1); explicit CUDAContext(const DeviceOption& option); + explicit CUDAContext(const at::Device& device) + : CUDAContext(DeviceToOption(device)) {} ~CUDAContext() override { if (curand_generator_) { @@ -385,19 +387,6 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return CUDA; } diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index 720c2dcaa46de1..c0031cb0661ec8 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -12,7 +12,7 @@ CAFFE_KNOWN_TYPE(db::Cursor); namespace db { -CAFFE_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); +C10_DEFINE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); // Below, we provide a bare minimum database "minidb" as a reference // implementation as well as a portable choice to store data. diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 39f8b6f3f02b0d..f6044ff35f8273 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -3,8 +3,8 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/blob_serialization.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { @@ -104,9 +104,9 @@ class CAFFE2_API DB { // Database classes are registered by their names so we can do optional // dependencies. -CAFFE_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); +C10_DECLARE_REGISTRY(Caffe2DBRegistry, DB, const string&, Mode); #define REGISTER_CAFFE2_DB(name, ...) \ - CAFFE_REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(Caffe2DBRegistry, name, __VA_ARGS__) /** * Returns a database object of the given database type, source and mode. 
The diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index a84d298466dc03..43131d8beebd27 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -1,6 +1,7 @@ #include "caffe2/core/flags.h" #include +#include #include #include "caffe2/core/logging.h" @@ -33,8 +34,7 @@ C10_EXPORT bool CommandLineFlagsHasBeenParsed() { #else // CAFFE2_USE_GFLAGS - -CAFFE_DEFINE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); +C10_DEFINE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); namespace { static bool gCommandLineFlagsParsed = false; diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h index 4e39c7bdebf137..98b137c2f723ef 100644 --- a/caffe2/core/flags.h +++ b/caffe2/core/flags.h @@ -20,7 +20,8 @@ #ifndef CAFFE2_CORE_FLAGS_H_ #define CAFFE2_CORE_FLAGS_H_ -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" +#include "caffe2/core/common.h" namespace caffe2 { /** @@ -142,7 +143,7 @@ class CAFFE2_API Caffe2FlagParser { bool success_; }; -CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); +C10_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); } // namespace caffe2 diff --git a/caffe2/core/hip/context_hip.cc b/caffe2/core/hip/context_hip.cc index 0fabb20a642c94..3eadaf0e71b118 100644 --- a/caffe2/core/hip/context_hip.cc +++ b/caffe2/core/hip/context_hip.cc @@ -50,6 +50,11 @@ CAFFE2_DEFINE_int(caffe2_gpu_memory_report_interval_mb, 128, "The threshold in MB on how frequently to report memory changes"); +namespace at { + +REGISTER_CONTEXT(DeviceType::HIP, caffe2::HIPContext); +} // namespace at + namespace caffe2 { thread_local ThreadLocalHIPObjects HIPContext::hip_objects_; @@ -408,13 +413,12 @@ void HIPStaticContext::Delete(void* ptr) { g_hip_device_affiliation.erase(it); break; } - case HipMemoryPoolType::THC: - { - HIP_ENFORCE(g_thc_allocator->Free(ptr)); - if (FLAGS_caffe2_gpu_memory_tracking) { - g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); - } - break; + case HipMemoryPoolType::THC: { + HIP_ENFORCE(g_thc_allocator->Free(ptr)); + if (FLAGS_caffe2_gpu_memory_tracking) { + g_hip_device_affiliation.erase(g_hip_device_affiliation.find(ptr)); + } + break; } } } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index 5a7613cf934fd0..fb04336354e704 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -127,6 +127,8 @@ class HIPContext final : public BaseContext { // The default HIP context constructor. 
explicit HIPContext(const int gpu_id = -1); explicit HIPContext(const DeviceOption& option); + explicit HIPContext(const at::Device& device) + : HIPContext(DeviceToOption(device)) {} ~HIPContext() override { if (hiprand_generator_) { @@ -374,19 +376,6 @@ class HIPStaticContext final : public BaseStaticContext { public: std::pair New(size_t nbytes) const override; - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - - std::unique_ptr CreateContext(int gpu_id = -1) { - return caffe2::make_unique(gpu_id); - } - DeviceType GetDeviceType() override { return HIP; } diff --git a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc index e1b4ff2aebb0fc..3ad9336e6d2aee 100644 --- a/caffe2/core/hip/net_async_hip_thread_pool_hip.cc +++ b/caffe2/core/hip/net_async_hip_thread_pool_hip.cc @@ -55,6 +55,6 @@ GetAsyncNetHIPThreadPool(int hip_gpu_id, int pool_size, bool create_new) { } } -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, HIP, GetAsyncNetHIPThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, HIP, GetAsyncNetHIPThreadPool); } // namespace caffe2 diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index cd057444d31cf4..30603888ad1c44 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -3,6 +3,7 @@ #include #include +#include #include // Common code that we use regardless of whether we use glog or not. @@ -12,6 +13,11 @@ CAFFE2_DEFINE_bool(caffe2_use_fatal_for_enforce, false, "of throwing an exception."); namespace caffe2 { +namespace enforce_detail { +/* implicit */ EnforceFailMessage::EnforceFailMessage(std::string&& msg) { + msg_ = new std::string(std::move(msg)); +} +} // namespace enforce_detail size_t ReplaceAll(string& s, const char* from, const char* to) { CAFFE_ENFORCE(from && *from); diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h index 288c34afd5dbe7..859ee4765683a1 100644 --- a/caffe2/core/logging.h +++ b/caffe2/core/logging.h @@ -187,9 +187,8 @@ class CAFFE2_API EnforceFailMessage { "like `Equals`. Use CAFFE_ENFORCE for simple boolean checks."); } - /* implicit */ EnforceFailMessage(std::string&& msg) { - msg_ = new std::string(std::move(msg)); - } + /* implicit */ EnforceFailMessage(std::string&& msg); + inline bool bad() const { return msg_ != nullptr; } diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index 77934f6be12d45..c72c34e37e8c00 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -19,7 +19,7 @@ CAFFE2_DEFINE_string( namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( NetRegistry, NetBase, const std::shared_ptr&, diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 57fd53f1de4f12..30ef4bde50cab7 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -9,12 +9,12 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" @@ -134,15 +134,15 @@ class CAFFE2_API ExecutorHelper { virtual ~ExecutorHelper() {} }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( NetRegistry, NetBase, const std::shared_ptr&, Workspace*); #define REGISTER_NET_CREATOR(key, ...) 
\ - CAFFE_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(NetRegistry, key, __VA_ARGS__) #define REGISTER_NET(name, ...) \ - CAFFE_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(NetRegistry, name, __VA_ARGS__) /** * @brief Creates a network, accessing / creating blobs in the given workspace. diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index b40a8fa33778a7..fe4b57cd3326d4 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -119,7 +119,7 @@ bool AsyncNetBase::RunAsync() { return DoRunAsync(); } -TaskThreadPool* AsyncNetBase::pool_getter( +TaskThreadPool* AsyncNetBase::poolGetter( PoolsMap& pools, int device_type, int device_id, @@ -136,7 +136,7 @@ TaskThreadPool* AsyncNetBase::pool_getter( TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { if (use_single_pool_) { - return pool_getter(cpu_pools_, PROTO_CPU, -1, num_workers_); + return poolGetter(cpu_pools_, PROTO_CPU, -1, num_workers_); } static const std::unordered_set cpu_types{ PROTO_CPU, @@ -155,13 +155,13 @@ TaskThreadPool* AsyncNetBase::pool(const DeviceOption& device_option) { FLAGS_caffe2_net_async_max_numa_nodes, "Invalid NUMA node id: ", numa_node_id); - return pool_getter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); + return poolGetter(cpu_pools_, PROTO_CPU, numa_node_id, num_workers_); } else if (device_option.device_type() == PROTO_CUDA) { auto gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE( gpu_id >= 0 && gpu_id < FLAGS_caffe2_net_async_max_gpus, "Invalid GPU id: " + caffe2::to_string(gpu_id)); - return pool_getter(gpu_pools_, PROTO_CUDA, gpu_id, num_workers_); + return poolGetter(gpu_pools_, PROTO_CUDA, gpu_id, num_workers_); } else { CAFFE_THROW( "Unsupported device type " + @@ -281,10 +281,20 @@ bool AsyncNetBase::testAndSetScheduled(int task_id) { return !task_op_node.scheduled_.test_and_set(); } -int AsyncNetBase::num_ops(int task_id) const { +int AsyncNetBase::numOps(int task_id) const { return chains_[task_id].size(); } +const OperatorBase* AsyncNetBase::firstTaskOp(int task_id) const { + auto op_id = chains_[task_id].front(); + return operator_nodes_[op_id].operator_.get(); +} + +const OperatorBase* AsyncNetBase::lastTaskOp(int task_id) const { + auto op_id = chains_[task_id].back(); + return operator_nodes_[op_id].operator_.get(); +} + void AsyncNetBase::asyncWait( int task_id, int stream_id, @@ -408,14 +418,9 @@ void AsyncNetBase::finalizeEvents() { AsyncNetBase::~AsyncNetBase() {} -CAFFE_DEFINE_SHARED_REGISTRY( - ThreadPoolRegistry, - TaskThreadPool, - int, - int, - bool); +C10_DEFINE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, CPU, GetAsyncNetCPUThreadPool); /* static */ std::shared_ptr diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 502233e7f045b4..30948853dfb410 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -1,11 +1,11 @@ #ifndef CAFFE2_CORE_NET_ASYNC_BASE_H_ #define CAFFE2_CORE_NET_ASYNC_BASE_H_ +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/net_async_base.h" #include "caffe2/core/net_dag_utils.h" -#include "caffe2/core/registry.h" #include "caffe2/core/stats.h" #include "caffe2/core/timer.h" #include "caffe2/core/workspace.h" @@ -65,7 +65,9 @@ class CAFFE2_API AsyncNetBase : public NetBase { int 
updateParentCount(int child_id); int getParentCount(int child_id); bool testAndSetScheduled(int task_id); - int num_ops(int task_id) const; + int numOps(int task_id) const; + const OperatorBase* firstTaskOp(int task_id) const; + const OperatorBase* lastTaskOp(int task_id) const; void asyncWait( int task_id, @@ -131,7 +133,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { void storeExceptionPtr(); TaskThreadPool* - pool_getter(PoolsMap& pools, int device_type, int device_id, int pool_size); + poolGetter(PoolsMap& pools, int device_type, int device_id, int pool_size); std::unique_ptr helper_; @@ -139,12 +141,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { friend class tracing::Tracer; }; -CAFFE_DECLARE_SHARED_REGISTRY( - ThreadPoolRegistry, - TaskThreadPool, - int, - int, - bool); +C10_DECLARE_SHARED_REGISTRY(ThreadPoolRegistry, TaskThreadPool, int, int, bool); class AsyncNetExecutorHelper : public ExecutorHelper { public: diff --git a/caffe2/core/net_async_gpu_thread_pool_gpu.cc b/caffe2/core/net_async_gpu_thread_pool_gpu.cc index ca3f691bc49764..dc0bf118ab7956 100644 --- a/caffe2/core/net_async_gpu_thread_pool_gpu.cc +++ b/caffe2/core/net_async_gpu_thread_pool_gpu.cc @@ -6,7 +6,7 @@ CAFFE2_DEFINE_int(caffe2_threads_per_gpu, 1, "Number of CPU threads per GPU"); namespace caffe2 { -CAFFE_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool); +C10_REGISTER_CREATOR(ThreadPoolRegistry, CUDA, GetAsyncNetGPUThreadPool); std::shared_ptr GetAsyncNetGPUThreadPool(int gpu_id, int pool_size, bool create_new) { diff --git a/caffe2/core/net_async_scheduling.cc b/caffe2/core/net_async_scheduling.cc index 7feb3631abfd66..80d5807295f75a 100644 --- a/caffe2/core/net_async_scheduling.cc +++ b/caffe2/core/net_async_scheduling.cc @@ -35,6 +35,17 @@ void AsyncSchedulingNet::Wait() { } } +bool AsyncSchedulingNet::isInlineTask(int parent_id, int child_id) const { + if (!use_dfs_scheduling_) { + return false; + } + const auto* last_parent_op = lastTaskOp(parent_id); + const auto* first_child_op = firstTaskOp(child_id); + // check that we do not cross device boundary + return IsSameDevice( + last_parent_op->device_option(), first_child_op->device_option()); +} + void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { if (!testAndSetScheduled(task_id)) { return; @@ -63,7 +74,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { canSchedule(child_id)) { // if DFS scheduling is enabled, run children inline, // ignore DFS scheduling in callbacks - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } else { bool parent_failed = false; bool parent_needs_polling = false; @@ -102,7 +113,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { if (parent_failed) { // one of parents failed, set failure flag and wrap up execution success_ = false; - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } else if (parent_needs_polling) { // some parents are blocking us from scheduling a child and don't // support callbacks, using polling @@ -119,7 +130,7 @@ void AsyncSchedulingNet::schedule(int task_id, bool run_inline) { } } else { // we're ready to schedule a child - schedule(child_id, use_dfs_scheduling_); + schedule(child_id, isInlineTask(task_id, child_id)); } } } diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h index 4fcdf4b7316818..69563c4f20b325 100644 --- a/caffe2/core/net_async_scheduling.h +++ b/caffe2/core/net_async_scheduling.h 
@@ -22,6 +22,7 @@ class CAFFE2_API AsyncSchedulingNet : public AsyncNetBase { void reset() override; virtual void finishRun(); void parentCallback(int parent_id); + bool isInlineTask(int parent_id, int child_id) const; std::mutex running_mutex_; std::condition_variable running_cv_; diff --git a/caffe2/core/net_dag.h b/caffe2/core/net_dag.h index ab3ce0f6f3fa10..7c66217a23ec4d 100644 --- a/caffe2/core/net_dag.h +++ b/caffe2/core/net_dag.h @@ -9,6 +9,7 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" @@ -16,7 +17,6 @@ #include "caffe2/core/net_dag_utils.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/stats.h" #include "caffe2/core/tensor.h" #include "caffe2/core/timer.h" diff --git a/caffe2/core/net_dag_utils.h b/caffe2/core/net_dag_utils.h index 6debfbf7bd8053..0259f10f954652 100644 --- a/caffe2/core/net_dag_utils.h +++ b/caffe2/core/net_dag_utils.h @@ -10,13 +10,13 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index c114fd8d224f21..5b8bc29be4dfae 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/net_simple_async.h b/caffe2/core/net_simple_async.h index ea5aae959870f6..abe16f2013789f 100644 --- a/caffe2/core/net_simple_async.h +++ b/caffe2/core/net_simple_async.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index e7a889980365c5..523f29225aa07b 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -354,6 +354,11 @@ std::vector nodeIterator(G& g) { return out; } +template +inline std::vector filter(NNModule& nn) { + return nodeIterator(nn.dataFlow); +} + template std::vector> dataIterator(G& g) { std::vector> out; diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index fa8aee6d818366..b21246a6611789 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -1,11 +1,11 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/net_dag.h" #include "caffe2/core/net_simple.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator.h" -#include "caffe2/core/registry.h" #include "caffe2/core/scope_guard.h" namespace caffe2 { diff --git a/caffe2/core/operator.cc 
b/caffe2/core/operator.cc index 5f3f653b5a4b21..79be08c03b2325 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -316,31 +316,32 @@ std::map* gDeviceTypeRegistry() { return &g_device_type_registry; } -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( CPUOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(CPU, CPUOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(CUDA, CUDAOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( HIPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); CAFFE_REGISTER_DEVICE_TYPE(HIP, HIPOperatorRegistry); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( GradientRegistry, GradientMakerBase, - const OperatorDef&, const vector&); + const OperatorDef&, + const vector&); GradientOpsMeta GetGradientForOp( const OperatorDef& def, const vector& g_output) { diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 1a968c4c3755fe..8208eb271bdc1b 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -9,13 +9,13 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" #include "caffe2/core/common.h" #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/operator_gradient.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/types.h" #include "caffe2/core/workspace.h" @@ -778,13 +778,13 @@ CAFFE2_DEFINE_TENSOR_TYPES_DISPATCHER( // registry function. // (2) Then, one can call the operator registry function to further create the // operators. -typedef Registry< +typedef c10::Registry< std::string, std::unique_ptr, const OperatorDef&, Workspace*> OperatorRegistry; -typedef Registry< +typedef c10::Registry< std::string, std::unique_ptr, const OperatorDef&, @@ -806,7 +806,7 @@ struct CAFFE2_API DeviceTypeRegisterer { #define CAFFE_REGISTER_DEVICE_TYPE(type, registry_function) \ namespace { \ - static DeviceTypeRegisterer CAFFE_ANONYMOUS_VARIABLE( \ + static DeviceTypeRegisterer C10_ANONYMOUS_VARIABLE( \ DeviceType)(type, ®istry_function); \ } @@ -817,69 +817,67 @@ struct CAFFE2_API DeviceTypeRegisterer { // not depend on specific cuda or cudnn libraries. This means that we will be // able to compile it even when there is no cuda available - we simply do not // link any cuda or cudnn operators. -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( CPUOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CPU_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(CPUOperatorRegistry, name, __VA_ARGS__) #define REGISTER_CPU_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(CPUOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_CPU_OPERATOR_WITH_ENGINE(name, engine, ...) 
\ - CAFFE_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(CPUOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( CUDAOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(CUDAOperatorRegistry, name, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(CUDAOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS( \ - CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(CUDAOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) // Macros for cudnn since we use it often #define REGISTER_CUDNN_OPERATOR(name, ...) \ REGISTER_CUDA_OPERATOR_WITH_ENGINE(name, CUDNN, __VA_ARGS__) // Macros for HIP operators -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( HIPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_HIP_OPERATOR(name, ...) \ C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ - CAFFE_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS( \ - HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(HIPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) #define REGISTER_MIOPEN_OPERATOR(name, ...) \ REGISTER_HIP_OPERATOR_WITH_ENGINE(name, MIOPEN, __VA_ARGS__) diff --git a/caffe2/core/operator_c10wrapper.cc b/caffe2/core/operator_c10wrapper.cc index 6fd62ec1cf63b4..523c467b170d58 100644 --- a/caffe2/core/operator_c10wrapper.cc +++ b/caffe2/core/operator_c10wrapper.cc @@ -2,7 +2,7 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( C10OperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/core/operator_c10wrapper.h b/caffe2/core/operator_c10wrapper.h index 695319266901a8..57a3c370ba5e32 100644 --- a/caffe2/core/operator_c10wrapper.h +++ b/caffe2/core/operator_c10wrapper.h @@ -284,7 +284,7 @@ struct ParameterHelper final { } }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( C10OperatorRegistry, OperatorBase, const OperatorDef&, @@ -293,14 +293,14 @@ CAFFE_DECLARE_REGISTRY( // TODO Currently we only register the CPU variant. This is going to be fixed // once the tensor detemplatization lands. 
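For operator code the public macros keep their old names; only their expansions switch to the C10_* registry machinery, so a typical registration still reads as below. A hedged sketch, not part of this diff (the operator is a hypothetical no-op, shown only for shape):

```
#include "caffe2/core/operator.h"

namespace caffe2 {

// Hypothetical do-nothing operator used purely to illustrate registration.
class NoOpExampleOp final : public Operator<CPUContext> {
 public:
  NoOpExampleOp(const OperatorDef& def, Workspace* ws)
      : Operator<CPUContext>(def, ws) {}
  bool RunOnDevice() override {
    return true;  // a real operator would read Input(i) and fill Output(i)
  }
};

// REGISTER_CPU_OPERATOR now expands to C10_REGISTER_CLASS(CPUOperatorRegistry,
// ...) and OPERATOR_SCHEMA to a C10_ANONYMOUS_VARIABLE, but call sites are
// unchanged.
REGISTER_CPU_OPERATOR(NoOpExample, NoOpExampleOp);
OPERATOR_SCHEMA(NoOpExample).NumInputs(0).NumOutputs(0);

}  // namespace caffe2
```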
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OpSchemaDef, State, Name) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper>) #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_PARAMETERS( \ OpSchemaDef, State, Name, ...) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper< \ @@ -312,14 +312,14 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_ARRAY_INPUT( \ OpSchemaDef, State, Name) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper>) #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_WITH_ARRAY_INPUT_AND_PARAMETERS( \ OpSchemaDef, State, Name, ...) \ - CAFFE_REGISTER_CLASS( \ + C10_REGISTER_CLASS( \ C10OperatorRegistry, \ Name, \ C10OperatorWrapper< \ diff --git a/caffe2/core/operator_gradient.h b/caffe2/core/operator_gradient.h index 3eea164c21b840..2eb5b581092c30 100644 --- a/caffe2/core/operator_gradient.h +++ b/caffe2/core/operator_gradient.h @@ -1,8 +1,8 @@ #ifndef CAFFE2_CORE_OPERATOR_GRADIENT_H_ #define CAFFE2_CORE_OPERATOR_GRADIENT_H_ +#include "c10/util/Registry.h" #include "caffe2/core/operator_schema.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/proto_utils.h" @@ -295,16 +295,16 @@ struct GradientNotImplementedYet : public GradientMakerBase { } }; -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( GradientRegistry, GradientMakerBase, const OperatorDef&, const vector&); #define REGISTER_GRADIENT(name, ...) \ - CAFFE_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(GradientRegistry, name, __VA_ARGS__) #define REGISTER_GRADIENT_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(GradientRegistry, str_name, __VA_ARGS__) // NO_GRADIENT means that the operator does not need any gradient computation. #define NO_GRADIENT(name) REGISTER_GRADIENT(name, NoGradient) diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 54a6a17b8a0d24..a938d8f56afc93 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -9,9 +9,9 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" -#include "caffe2/core/registry.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/filler.h" @@ -578,14 +578,14 @@ OpSchema::Cost PointwiseCostInference( #define OPERATOR_SCHEMA(name) \ C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ + static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #else // CAFFE2_NO_OPERATOR_SCHEMA #define OPERATOR_SCHEMA(name) \ C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ + static OpSchema* C10_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ 1 ? 
nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #endif // CAFFE2_NO_OPERATOR_SCHEMA diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 8e48b6b7beabca..51faaed9e7eec9 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -489,7 +489,9 @@ bool RunPlanOnWorkspace( NetDefMap net_defs; for (const NetDef& net_def : plan.network()) { - LOG(INFO) << "Processing net '" << net_def.name() << "'"; + LOG(INFO) << "Processing net '" << net_def.name() << "', type: '" + << net_def.type() << "', #ops: " << net_def.op_size() + << ", num_workers: " << net_def.num_workers(); CAFFE_ENFORCE( net_defs.count(net_def.name()) == 0, "Your plan contains networks of the same name \"", diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h deleted file mode 100644 index f026795b23c3e1..00000000000000 --- a/caffe2/core/registry.h +++ /dev/null @@ -1,207 +0,0 @@ -/** - * Simple registry implementation in Caffe2 that uses static variables to - * register object creators during program initialization time. - * - * WARNING: this registry is not entirely thread-safe, as reads to - * the registry are not protected by a mutex. The safest mode of use - * is to dlopen() *all* dynamic libraries that may write to the library - * and synchronize prior to performing any reads on the registry. - */ -#ifndef CAFFE2_CORE_REGISTRY_H_ -#define CAFFE2_CORE_REGISTRY_H_ - -#include -#include -#include -#include -#include -#include - -#include - -#include "caffe2/core/common.h" -#include "caffe2/core/typeid.h" - -namespace caffe2 { - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. - */ -template -class Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. - std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - at::PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a vector. 
- */ - vector Keys() { - vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const CaffeMap& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - CaffeMap registry_; - CaffeMap help_message_; - std::mutex register_mutex_; - - C10_DISABLE_COPY_AND_ASSIGN(Registry); -}; - -template -class Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * CAFFE_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define CAFFE_CONCATENATE_IMPL(s1, s2) s1##s2 -#define CAFFE_CONCATENATE(s1, s2) CAFFE_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define CAFFE_ANONYMOUS_VARIABLE(str) CAFFE_CONCATENATE(str, __COUNTER__) -#else -#define CAFFE_ANONYMOUS_VARIABLE(str) CAFFE_CONCATENATE(str, __LINE__) -#endif - -/** - * CAFFE_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - C10_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, ##__VA_ARGS__> \ - Registerer##RegistryName; - -#define CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - C10_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName() { \ - static Registry, ##__VA_ARGS__>* registry = \ - new Registry, ##__VA_ARGS__>(); \ - return registry; \ - } - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define CAFFE_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - static Registerer##RegistryName CAFFE_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define CAFFE_REGISTER_TYPED_CLASS(RegistryName, key, ...) \ - namespace { \ - static Registerer##RegistryName CAFFE_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - at::demangle_type<__VA_ARGS__>()); \ - } - -// CAFFE_DECLARE_REGISTRY and CAFFE_DEFINE_REGISTRY are hard-wired to use string -// as the key -// type, because that is the most commonly used cases. -#define CAFFE_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) - -#define CAFFE_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) - -#define CAFFE_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) 
\ - CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) - -#define CAFFE_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - CAFFE_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) - -// CAFFE_REGISTER_CREATOR and CAFFE_REGISTER_CLASS are hard-wired to use string -// as the key -// type, because that is the most commonly used cases. -#define CAFFE_REGISTER_CREATOR(RegistryName, key, ...) \ - CAFFE_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define CAFFE_REGISTER_CLASS(RegistryName, key, ...) \ - CAFFE_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace caffe2 -#endif // CAFFE2_CORE_REGISTRY_H_ diff --git a/caffe2/core/registry_test.cc b/caffe2/core/registry_test.cc deleted file mode 100644 index 7ad8ead553463a..00000000000000 --- a/caffe2/core/registry_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include - -#include "caffe2/core/registry.h" -#include -#include "caffe2/core/logging.h" - -namespace caffe2 { -namespace { - -class Foo { - public: - explicit Foo(int x) { LOG(INFO) << "Foo " << x; } -}; - -CAFFE_DECLARE_REGISTRY(FooRegistry, Foo, int); -CAFFE_DEFINE_REGISTRY(FooRegistry, Foo, int); -#define REGISTER_FOO(clsname) \ - CAFFE_REGISTER_CLASS(FooRegistry, clsname, clsname) - -class Bar : public Foo { - public: - explicit Bar(int x) : Foo(x) { LOG(INFO) << "Bar " << x; } -}; -REGISTER_FOO(Bar); - -class AnotherBar : public Foo { - public: - explicit AnotherBar(int x) : Foo(x) { - LOG(INFO) << "AnotherBar " << x; - } -}; -REGISTER_FOO(AnotherBar); - -TEST(RegistryTest, CanRunCreator) { - unique_ptr bar(FooRegistry()->Create("Bar", 1)); - EXPECT_TRUE(bar != nullptr) << "Cannot create bar."; - unique_ptr another_bar(FooRegistry()->Create("AnotherBar", 1)); - EXPECT_TRUE(another_bar != nullptr); -} - -TEST(RegistryTest, ReturnNullOnNonExistingCreator) { - EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr); -} -} -} // namespace caffe2 diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 86c6827e3039a1..f037ca6e175606 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -350,8 +350,8 @@ _ScopeGuard ScopeGuard(T f) { ##__VA_ARGS__); \ } -#define CAFFE_DURATION(stats, field, ...) \ - if (auto g = detail::ScopeGuard([&](int64_t nanos) { \ - CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ +#define CAFFE_DURATION(stats, field, ...) 
\ + if (auto g = ::caffe2::detail::ScopeGuard([&](int64_t nanos) { \ + CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ })) } // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index caa0ba9ea55f49..0e531c83fcb7ad 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -6,8 +6,6 @@ namespace caffe2 { CAFFE_DEFINE_KNOWN_TYPE(Tensor); -UndefinedTensorImpl UndefinedTensorImpl::singleton_; - TensorPrinter::TensorPrinter( const std::string& tensor_name, const std::string& file_name, diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 1e4cac2788b560..bb478e415a8ce6 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -5,34 +5,19 @@ #include "caffe2/core/tensor_impl.h" #include +#include namespace caffe2 { -class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { - UndefinedTensorImpl() : TensorImpl(at::Storage()){}; - - public: - // Without this, we get: - // error: identifier "at::UndefinedTensor::_singleton" is undefined in device code - // (ostensibly because the constexpr tricks MSVC into trying to compile this - // function for device as well). -#ifdef _WIN32 - static inline TensorImpl * singleton() { -#else - static constexpr inline TensorImpl * singleton() { -#endif - return &singleton_; - } - - private: - static UndefinedTensorImpl singleton_; -}; +using at::UndefinedTensorImpl; /** * @brief Tensor class holds a shared pointer to the implementation TensorImpl, * redirects API calls to TensorImpl; * Copying of Tensor results in sharing the same underlying implementation * object + * + * NB: See TensorImpl for documentation on these methods. */ class CAFFE2_API Tensor final { protected: @@ -130,28 +115,52 @@ class CAFFE2_API Tensor final { return impl_.get()->GetStaticContext(); } - std::unique_ptr CreateContext() const { - return impl_.get()->CreateContext(); + DeviceType GetDeviceType() const { + return impl_->device_type(); } - DeviceType GetDeviceType() const { - return impl_.get()->GetDeviceType(); + at::Device GetDevice() const { + return impl_.get()->GetDevice(); } void CopyFrom(const Tensor& src, BaseContext* context = nullptr) const { impl_.get()->CopyFrom(*src.impl_.get(), context); } + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. + */ void ExtendTo(int64_t num, float growthPct, BaseContext* context) const { - impl_.get()->ExtendTo(num, growthPct, context); + CAFFE_ENFORCE_GE_WITH_CALLER(impl_->dim(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - impl_->size(0), growthPct, context); } void Extend(int64_t num, float growthPct, BaseContext* context) const { impl_.get()->Extend(num, growthPct, context); } + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shrunk tensor is maintained. + * Notably, this function does NOT respect caffe2_keep_on_shrink. 
+ */ void ShrinkTo(int64_t outer_dim) const { - impl_.get()->ShrinkTo(outer_dim); + CAFFE_ENFORCE_WITH_CALLER( + impl_->is_contiguous(), + "Right now ShrinkTo is only supported on contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER(impl_->dim() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= impl_->size(0), + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + impl_->storage().unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + impl_.get()->set_size(0, outer_dim); } template @@ -164,8 +173,18 @@ class CAFFE2_API Tensor final { impl_.get()->Resize(dim_source...); } + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + * This method respects caffe2_keep_on_shrink. + */ inline void ResizeLike(const Tensor& src_tensor) const { - impl_.get()->ResizeLike(*src_tensor.impl_.get()); + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); + if (impl_ != src_tensor.impl_) { + impl_.get()->Resize(src_tensor.dims()); + } } inline void Reshape(const vector& dims) const { @@ -173,15 +192,27 @@ class CAFFE2_API Tensor final { } inline void Reshape(const vector& dims) const { - impl_.get()->Reshape(dims); + impl_.get()->Reshape(ToVectorint64_t(dims)); } inline void FreeMemory() const { impl_.get()->FreeMemory(); } + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. + */ string DebugString() const { - return impl_.get()->DebugString(); + std::stringstream ss; + ss << "A Tensor of item size " << impl_->storage().itemsize() << " and type " + << impl_->dtype().name() << " and dimension ("; + for (int d : impl_->sizes()) { + ss << d << ","; + } + ss << ")."; + return ss.str(); } // NB: a.swap(b) is not equivalent to std::swap(a, b); @@ -196,25 +227,42 @@ class CAFFE2_API Tensor final { impl_.get()->ShareData(*src.impl_.get()); } + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. 
+ */ template void ShareExternalPointer( T* src, size_t capacity = 0, MemoryDeleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, capacity, d); + ShareExternalPointer((void*)src, caffe2::TypeMeta::Make(), capacity, d); } template void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const { - impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity); + ShareExternalPointer(std::move(data_ptr), caffe2::TypeMeta::Make(), capacity); } void ShareExternalPointer( void* src, - const TypeMeta& meta, + const TypeMeta& data_type, size_t capacity = 0, MemoryDeleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, meta, capacity, d); + CAFFE_ENFORCE_WITH_CALLER( + impl_->is_contiguous(), + "Right now ShareExternalPointer is only supported for contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + impl_.get()->ShareExternalPointer( + at::DataPtr(src, src, d, impl_->device_type()), data_type, capacity); } void ShareExternalPointer( @@ -224,8 +272,12 @@ class CAFFE2_API Tensor final { impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity); } + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. + */ inline const void* raw_data() const { - return impl_.get()->raw_data(); + return impl_->data(); } template @@ -237,8 +289,22 @@ class CAFFE2_API Tensor final { return impl_.get()->raw_mutable_data(meta); } + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ inline void* raw_mutable_data() const { - return impl_.get()->raw_mutable_data(); + const auto& data_type = impl_->dtype(); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != caffe2::TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(data_type); } template @@ -246,20 +312,34 @@ class CAFFE2_API Tensor final { return impl_.get()->mutable_data(); } + /** + * Returns the number of dimensions of the data. + */ inline int ndim() const { - return impl_.get()->ndim(); + return impl_->dim(); } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ inline int64_t size() const { - return impl_.get()->size(); + return impl_->numel(); } + /** + * Return the number of bytes each item takes in the tensor. + */ inline size_t itemsize() const { - return impl_.get()->itemsize(); + return impl_->storage().itemsize(); } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). 
+ */ inline size_t nbytes() const { - return impl_.get()->nbytes(); + return impl_->numel() * itemsize(); } inline const vector& dims() const { @@ -267,26 +347,37 @@ class CAFFE2_API Tensor final { } inline int64_t size_from_dim(int k) const { - return impl_.get()->size_from_dim(k); + return size_from_dim_(k, impl_->sizes()); } inline int64_t size_to_dim(int k) const { - return impl_.get()->size_to_dim(k); + return size_to_dim_(k, impl_->sizes()); } inline int64_t size_between_dim(int k, int l) const { - return impl_.get()->size_between_dim(k, l); + return size_between_dim_(k, l, impl_->sizes()); } + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < dim(), return index. + * If -ndim <= index <= -1, return (dim() - (-index)), + * e.g., the last axis index (dim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ inline int canonical_axis_index(int axis_index) const { - return impl_.get()->canonical_axis_index(axis_index); + return canonical_axis_index_(axis_index, impl_->dim()); } inline int64_t stride(int64_t dim) const { return impl_.get()->stride(dim); } - inline at::DimVector strides() { + inline at::IntList strides() { return impl_.get()->strides(); } @@ -294,25 +385,46 @@ class CAFFE2_API Tensor final { return impl_.get()->is_contiguous(); } + /** + * Checks if the tensor content is of the given data type. + */ template inline bool IsType() const { - return impl_.get()->IsType(); + return impl_->storage().IsType(); } + /** + * Returns the TypeMeta object associated with the current data type. + */ inline const TypeMeta& meta() const { - return impl_.get()->meta(); + return impl_->dtype(); } + /** + * Returns the i-th dimension of the tensor in int. + * + * This function returns an int value instead of int64_t, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. 
+ */ inline int dim32(const int i) const { - return impl_.get()->dim32(i); +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, static_cast(impl_->dim()), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + auto s = impl_->size(i); + CAFFE_ENFORCE_LT_WITH_CALLER(s, std::numeric_limits::max()); + return static_cast(s); } inline int64_t dim(const int i) const { - return impl_.get()->dim(i); + return impl_->size(i); } inline void ExtractDeviceOption(DeviceOption* device) const { - return impl_.get()->ExtractDeviceOption(device); + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, impl_->data()); } const Storage& storage() { diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc index cff98c6101ea5d..dc8d666d6cb3a5 100644 --- a/caffe2/core/tensor_impl.cc +++ b/caffe2/core/tensor_impl.cc @@ -1,5 +1,4 @@ #include "caffe2/core/tensor_impl.h" - #include "caffe2/core/flags.h" CAFFE2_DEFINE_bool( diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 53c812f55e297b..2ee51f655e1e22 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -3,911 +3,13 @@ #include #include #include -#include - -#include "caffe2/core/allocator.h" -#include "caffe2/core/common.h" -#include "caffe2/core/flags.h" -#include "caffe2/core/logging.h" - -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. -CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); +#include namespace caffe2 { - -// Defined by protobuf -class DeviceOption; - -/** - * A utility function to convert vector to vector. - */ -inline std::vector ToVectorint64_t(const std::vector& src) { - return std::vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline int64_t size_from_dim_(int k, const std::vector& dims) { - int64_t r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline int64_t size_to_dim_(int k, const std::vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - int64_t r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline int64_t size_between_dim_(int k, int l, const std::vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - int64_t r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. 
- * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - - explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { - data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; - } - - TensorImpl(const TensorImpl&) = default; - TensorImpl& operator=(const TensorImpl&) = default; - TensorImpl(TensorImpl&&) = default; - TensorImpl& operator=(TensorImpl&&) = default; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - at::BaseStaticContext* GetStaticContext() const { - auto device_type = GetDeviceType(); - return get_static_context(device_type); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. - */ - void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (data_type_ != src.meta()) { - CAFFE_ENFORCE_WITH_CALLER( - src.is_contiguous(), - "Right now only copy of contiguous source Tensor is supported."); - storage_ = at::Storage(GetDeviceType(), src.meta()); - data_type_ = src.meta(); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - strides_.clear(); - is_contiguous_ = true; - storage_.reset(); - data_type_ = TypeMeta(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (data_type_.copy()) { - CAFFE_ENFORCE( - GetDeviceType() == ::at::DeviceType::CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == ::at::DeviceType::CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != ::at::DeviceType::CPU || GetDeviceType() == ::at::DeviceType::CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->device_type() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of 
`num`. - */ - void ExtendTo(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. - */ - void Extend(int64_t num, float growthPct, at::BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now Extend is only supported for contiguous Tensor."); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(data_type_); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - data_type_, oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(int64_t outer_dim) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ShrinkTo is only supported on contiguous Tensor."); - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. 
- * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ReserveSpace is only supported for contiguous Tensor."); - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(data_type_); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. - * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). - * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - CAFFE_ENFORCE_WITH_CALLER( - src_tensor.is_contiguous(), - "Right now ResizeLike is only supported for contiguous Tensor."); - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const std::vector& dims) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now Reshape is only supported for contiguous Tensor."); - int64_t new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. 
- " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const std::vector& dims) { - Reshape(ToVectorint64_t(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = at::Storage(storage_.device_type(), data_type_); - storage_offset_ = 0; - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - std::string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << data_type_.name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - /** - * @brief Shares the data with another tensor. - * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - data_type_ = src.dtype(); - storage_offset_ = src.storage_offset(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. 
- */ - template - void - ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { - ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { - ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); - } - - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - MemoryDeleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - is_contiguous_, - "Right now ShareExternalPointer is only supported for contiguos Tensor."); - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - ShareExternalPointer( - at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); - } - - void ShareExternalPointer( - at::DataPtr&& data_ptr, - const TypeMeta& data_type, - size_t capacity) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), data_type, capacity); - data_type_ = data_type; - storage_offset_ = 0; - } else { - int64_t numel = capacity / data_type.itemsize(); - // Create a new Storage - storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); - data_type_ = data_type; - storage_offset_ = 0; - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. - */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. " - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - ", while tensor contains ", - data_type_.name(), - ". "); - return static_cast(storage_.data()) + storage_offset_; - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. 
- */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (data_type_ == meta && (storage_.data() || numel_ == 0)) { - return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); - } else { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - bool had_special_dtor = data_type_.dtor() != nullptr; - storage_offset_ = 0; - if (storage_.unique()) { - storage_.set_dtype(meta); - } else { - if (data_type_ != meta) { - storage_ = at::Storage(storage_.device_type(), meta); - } - } - data_type_ = meta; - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.numel() >= numel_)) { - AT_ASSERT(storage_offset_ == 0); // because we just reallocated - return storage_.data(); - } - const at::Allocator* allocator = storage_.allocator(); - // TODO: Get rid of StaticContext - CAFFE_ENFORCE( - allocator == nullptr, - "Allocator is not used within Caffe2 functions, please use StaticContext instead."); - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = data_type_.dtor(); - void* ptr; - at::DeleterFnPtr deleter; - auto ptr_and_deleter = GetStaticContext()->New( - numel_ * storage_.itemsize()); // Removing this can get rid of - // InefficientStdFunctionContext - ptr = ptr_and_deleter.first; - deleter = ptr_and_deleter.second; - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr, - [size, dtor, deleter](void* local_ptr) -> void { - dtor(local_ptr, size); - deleter(local_ptr); - }, - at::Device(storage_.device_type()))); - data_type_.ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( - ptr_and_deleter.first, - ptr_and_deleter.second, - at::Device(storage_.device_type()))); - } - storage_.set_numel(numel_); - AT_ASSERT(storage_offset_ == 0); // because we just reallocated - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(data_type_); - } - - /** - * Returns a typed pointer of the underlying storage. - * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. 
- */ - template - inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()) + storage_offset_; - } - // Check it here statically - otherwise TypeMeta would throw the runtime - // error in attempt to invoke TypeMeta::ctor() - static_assert( - std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); - return static_cast(raw_mutable_data(TypeMeta::Make())); - } - - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return dims_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline int64_t size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - - /** - * Returns the dimensions of the tensor as a vector. - */ - inline const std::vector& dims() const { - return dims_; - } - - inline int64_t size_from_dim(int k) const { - return size_from_dim_(k, dims_); - } - - inline int64_t size_to_dim(int k) const { - return size_to_dim_(k, dims_); - } - - inline int64_t size_between_dim(int k, int l) const { - return size_between_dim_(k, l, dims_); - } - - /** - * Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); - } - - inline int64_t stride(int64_t dim) const { -#ifndef NDEBUG - // TODO: dim wrapping? - CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER( - dim, 0, "Cannot have negative dimension index"); -#endif - return strides_[dim]; - } - - // TODO: Change to ArrayRef later - inline at::DimVector strides() { - return strides_; - } - - inline bool is_contiguous() const { - return is_contiguous_; - } - - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const TypeMeta& meta() const { - return data_type_; - } - - inline const TypeMeta& dtype() const { - return data_type_; - } - - /** - * Returns the i-th dimension of the tensor in int. - * - * This function returns an int value instead of int64_t, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); - return static_cast(dims_[i]); - } - - /** - * Returns the i-th dimension of the tensor. 
Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. - */ - inline int64_t dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return dims_[i]; - } - - void ExtractDeviceOption(DeviceOption* device) const { - auto* context = GetStaticContext(); - CHECK(context); - context->ExtractDeviceOption(device, raw_data()); - } - - const at::Storage& storage() { - return storage_; - } - - const at::Storage& storage() const { - return storage_; - } - - int64_t storage_offset() const { - return storage_offset_; - } - - protected: - // TODO: change to DimVector - std::vector dims_; // sizes_ - at::DimVector strides_; - int64_t numel_ = -1; // numel_ - bool is_contiguous_ = true; - // we decide to keep reserved_ and it will - // live in Tensor after the split - // The logic is that if Extend() or ReserveSpace() were ever called, - // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - at::Storage storage_; - int64_t storage_offset_ = 0; - TypeMeta data_type_; - - private: - template < - typename T, - typename = typename std::enable_if::value>::type> - bool SetDims(const std::vector& src) { - auto old_numel = numel_; - dims_.resize(src.size()); - int64_t new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { - new_numel *= src[i]; - dims_[i] = src[i]; - } - update_strides(); - numel_ = new_numel; - return numel_ != old_numel; - } - - bool SetDims() { - auto old_numel = numel_; - dims_.resize(0); - update_strides(); - numel_ = 1; - return numel_ != old_numel; - } - - // TODO(jiayq): maybe rewrite the following functions with initializer list. - // NVCC does not play well with initializer lists last time, but worth - // another shot. 
- bool SetDims(const int64_t d0) { - auto old_numel = numel_; - dims_.resize(1); - dims_[0] = d0; - update_strides(); - numel_ = d0; - return numel_ != old_numel; - } - - bool SetDims(const int64_t d0, const int64_t d1) { - auto old_numel = numel_; - dims_.resize(2); - dims_[0] = d0; - dims_[1] = d1; - update_strides(); - numel_ = d0 * d1; - return numel_ != old_numel; - } - - bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { - auto old_numel = numel_; - dims_.resize(3); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - update_strides(); - numel_ = d0 * d1 * d2; - return numel_ != old_numel; - } - - bool - SetDims(const int64_t d0, const int64_t d1, const int64_t d2, const int64_t d3) { - auto old_numel = numel_; - dims_.resize(4); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - dims_[3] = d3; - update_strides(); - numel_ = d0 * d1 * d2 * d3; - return numel_ != old_numel; - } - - inline void update_strides() { - strides_.resize(dims_.size()); - if (ndim() > 0) { - int last_idx = ndim() - 1; - strides_[last_idx] = 1; - for (auto i = last_idx - 1; i >= 0; --i) { - strides_[i] = strides_[i + 1] * std::max(dims_[i + 1], 1); - } - } - is_contiguous_ = true; - } -}; - + using at::ToVectorint64_t; + using at::size_from_dim_; + using at::size_to_dim_; + using at::size_between_dim_; + using at::canonical_axis_index_; + using at::TensorImpl; } diff --git a/caffe2/core/transform.cc b/caffe2/core/transform.cc index 5b3f80fbe3fc0a..549322abccc7da 100644 --- a/caffe2/core/transform.cc +++ b/caffe2/core/transform.cc @@ -10,7 +10,7 @@ namespace caffe2 { using transform::Graph; -CAFFE_DEFINE_REGISTRY(TransformRegistry, Transform); +C10_DEFINE_REGISTRY(TransformRegistry, Transform); std::vector> Transform::PatternMatch(const Graph& graph) { // checks if the node at index i is matched already or not diff --git a/caffe2/core/transform.h b/caffe2/core/transform.h index c6aaf119513847..723e14789d627c 100644 --- a/caffe2/core/transform.h +++ b/caffe2/core/transform.h @@ -150,9 +150,9 @@ class CAFFE2_API Transform { // Creates a Transform based on a key, which should be defined in registry. CAFFE2_API unique_ptr CreateTransform(string key); -CAFFE_DECLARE_REGISTRY(TransformRegistry, Transform); +C10_DECLARE_REGISTRY(TransformRegistry, Transform); #define REGISTER_TRANSFORM(name, ...) \ - CAFFE_REGISTER_CLASS(TransformRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(TransformRegistry, name, __VA_ARGS__) // Create a Transform object from registry, // and immediately apply it to a Netdef. 
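The registry changes in this diff (transform.h/.cc above, and the IDEEP/MKL/GL operator registries further below) replace the deleted caffe2/core/registry.h macros with their c10 counterparts: C10_DECLARE_REGISTRY, C10_DEFINE_REGISTRY, C10_REGISTER_CLASS, and C10_REGISTER_CREATOR. As a reference for the rename, here is a minimal sketch modeled on the deleted registry_test.cc; it assumes the C10 macros keep the same string-keyed, unique_ptr-returning shape as the CAFFE_* macros they replace, and the Foo/Bar names are illustrative only, not part of this change.

```cpp
#include <memory>

#include "c10/util/Registry.h"  // replacement for caffe2/core/registry.h

namespace caffe2 {

class Foo {
 public:
  explicit Foo(int x) : x_(x) {}
  virtual ~Foo() = default;
  int x_;
};

// Declare + define a registry keyed by std::string that creates
// std::unique_ptr<Foo> from an int argument (mirrors the old
// CAFFE_DECLARE_REGISTRY / CAFFE_DEFINE_REGISTRY pair).
C10_DECLARE_REGISTRY(FooRegistry, Foo, int);
C10_DEFINE_REGISTRY(FooRegistry, Foo, int);

class Bar : public Foo {
 public:
  explicit Bar(int x) : Foo(x) {}
};

// Registration happens at static initialization time, exactly as before.
C10_REGISTER_CLASS(FooRegistry, Bar, Bar);

} // namespace caffe2

int main() {
  // Creator lookup by key; unknown keys yield nullptr rather than throwing.
  std::unique_ptr<caffe2::Foo> bar = caffe2::FooRegistry()->Create("Bar", 1);
  std::unique_ptr<caffe2::Foo> missing =
      caffe2::FooRegistry()->Create("DoesNotExist", 1);
  return (bar != nullptr && missing == nullptr) ? 0 : 1;
}
```

Create() returning nullptr for unregistered keys is the behavior the deleted ReturnNullOnNonExistingCreator test relied on, and it is unchanged by the move to c10.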
diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index 2ad486c328f56d..324766359de607 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -11,8 +11,8 @@ #include #include +#include "c10/util/Registry.h" #include "caffe2/core/blob.h" -#include "caffe2/core/registry.h" #include "caffe2/core/net.h" #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/signal_handler.h" diff --git a/caffe2/ideep/operators/momentum_sgd_op.cc b/caffe2/ideep/operators/momentum_sgd_op.cc new file mode 100644 index 00000000000000..320780c12ffe1d --- /dev/null +++ b/caffe2/ideep/operators/momentum_sgd_op.cc @@ -0,0 +1,125 @@ +#include + +namespace caffe2 { + +void momentum_sgd_update( + const int N, + const float* g, + const float* m, + float* ng, + float* nm, + const float* lr, + const float momentum, + const bool nesterov, + float* param) { + const float LR = lr[0]; +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (auto i = 0; i < N; ++i) { + if (!nesterov) { + const float adjusted_gradient = LR * g[i] + momentum * m[i]; + nm[i] = adjusted_gradient; + ng[i] = adjusted_gradient; + } else { + const float mi = m[i]; + const float mi_new = momentum * mi + LR * g[i]; + nm[i] = mi_new; + ng[i] = (1 + momentum) * mi_new - momentum * mi; + } + + if (param) { + param[i] -= ng[i]; + } + } +} + +class IDEEPMomentumSGDOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + + IDEEPMomentumSGDOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + momentum_(OperatorBase::GetSingleArgument("momentum", 0.0)), + nesterov_(OperatorBase::GetSingleArgument("nesterov", 0)) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems()); + if (Input(GRAD) != *Output(OUTPUT_GRAD)) { + Output(OUTPUT_GRAD)->reinit(Input(GRAD).get_descriptor()); + } + if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) { + Output(OUTPUT_MOMENTUM)->reinit(Input(MOMENTUM).get_descriptor()); + } + + // TODO: Use itensor after 0-dim is supported. Now use CPU tensor. + const auto& lr = OperatorBase::Input(LR, CPU); + CAFFE_ENFORCE(lr.size() == 1); + + momentum_sgd_update( + Input(GRAD).get_nelems(), + static_cast(Input(GRAD).get_data_handle()), + static_cast(Input(MOMENTUM).get_data_handle()), + static_cast(Output(OUTPUT_GRAD)->get_data_handle()), + static_cast(Output(OUTPUT_MOMENTUM)->get_data_handle()), + lr.template data(), + momentum_, + nesterov_, + nullptr); + return true; + } + + protected: + float momentum_{0.9}; + bool nesterov_; + INPUT_TAGS(GRAD, MOMENTUM, LR); + OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM); +}; + +class IDEEPMomentumSGDUpdateOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + IDEEPMomentumSGDUpdateOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + momentum_(OperatorBase::GetSingleArgument("momentum", 0.0)), + nesterov_(OperatorBase::GetSingleArgument("nesterov", 0)) {} + + bool RunOnDevice() override { + CAFFE_ENFORCE(Input(GRAD).get_nelems() == Input(MOMENTUM).get_nelems()); + if (Input(GRAD) != *Output(OUTPUT_GRAD)) { + Output(OUTPUT_GRAD)->reinit(Input(GRAD).get_descriptor()); + } + if (Input(MOMENTUM) != *Output(OUTPUT_MOMENTUM)) { + Output(OUTPUT_MOMENTUM)->reinit(Input(MOMENTUM).get_descriptor()); + } + + // TODO: Use itensor after 0-dim is supported. Now use CPU tensor. 
+ const auto& lr = OperatorBase::Input(LR, CPU); + CAFFE_ENFORCE(lr.size() == 1); + + momentum_sgd_update( + Input(GRAD).get_nelems(), + static_cast(Input(GRAD).get_data_handle()), + static_cast(Input(MOMENTUM).get_data_handle()), + static_cast(Output(OUTPUT_GRAD)->get_data_handle()), + static_cast(Output(OUTPUT_MOMENTUM)->get_data_handle()), + lr.template data(), + momentum_, + nesterov_, + static_cast(Output(OUTPUT_PARAM)->get_data_handle())); + return true; + } + + protected: + float momentum_{0.9}; + bool nesterov_; + INPUT_TAGS(GRAD, MOMENTUM, LR, PARAM); + OUTPUT_TAGS(OUTPUT_GRAD, OUTPUT_MOMENTUM, OUTPUT_PARAM); +}; + +REGISTER_IDEEP_OPERATOR(MomentumSGD, IDEEPMomentumSGDOp); +REGISTER_IDEEP_OPERATOR(MomentumSGDUpdate, IDEEPMomentumSGDUpdateOp); + +} // namespace caffe2 diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 3226a08c4af9cf..0292dbd5d5a637 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -36,7 +36,7 @@ namespace caffe2 { * IDEEPFallbackOp>); */ template > -class IDEEPFallbackOp final : public IDEEPOperator { +class C10_EXPORT IDEEPFallbackOp final : public IDEEPOperator { public: USE_IDEEP_DEF_ALIASES(); USE_IDEEP_OPERATOR_FUNCTIONS(); diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index f50a4f34c66789..087078c507d164 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -20,6 +20,8 @@ class IDEEPContext final : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_IDEEP); } + explicit IDEEPContext(const at::Device& device) + : IDEEPContext(DeviceToOption(device)) {} ~IDEEPContext() noexcept override {} @@ -178,15 +180,6 @@ class IDEEPStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return IDEEP; } diff --git a/caffe2/ideep/utils/ideep_operator.h b/caffe2/ideep/utils/ideep_operator.h index 5cccbb509725c2..f9b6a831061388 100644 --- a/caffe2/ideep/utils/ideep_operator.h +++ b/caffe2/ideep/utils/ideep_operator.h @@ -6,21 +6,21 @@ namespace caffe2 { -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( IDEEPOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_IDEEP_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(IDEEPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR(name, ...) \ - CAFFE_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(IDEEPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(IDEEPOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_IDEEP_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(IDEEPOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) // IDEEPOperator is the base scaffolding of the operators that uses IDEEP. It // provides a few operators that are useful to IDEEP specific implementations. 
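The IDEEPMomentumSGDOp and IDEEPMomentumSGDUpdateOp operators added above both delegate to the plain momentum_sgd_update() helper defined at the top of momentum_sgd_op.cc. As a standalone restatement of the update rule that helper implements (classic momentum when nesterov is false, Nesterov momentum otherwise), here is a single-parameter sketch; the concrete numbers are illustrative only, and this is not the operator code itself.

```cpp
#include <cassert>
#include <cmath>

int main() {
  // Hypothetical inputs for a single parameter (illustrative values only).
  const float lr = 0.1f;        // learning rate, lr[0] in the operator
  const float momentum = 0.9f;  // "momentum" argument
  const float g = 2.0f;         // incoming gradient g[i]
  const float m = 0.5f;         // previous momentum buffer m[i]
  const float w = 1.0f;         // parameter param[i]

  // nesterov == false: nm[i] = ng[i] = lr * g[i] + momentum * m[i]
  const float m_std = lr * g + momentum * m;  // 0.65
  const float w_std = w - m_std;              // 0.35 after param[i] -= ng[i]

  // nesterov == true:
  //   nm[i] = momentum * m[i] + lr * g[i]
  //   ng[i] = (1 + momentum) * nm[i] - momentum * m[i]
  const float m_nag = momentum * m + lr * g;                     // 0.65
  const float g_nag = (1.0f + momentum) * m_nag - momentum * m;  // 0.785
  const float w_nag = w - g_nag;                                 // 0.215

  assert(std::fabs(m_std - 0.65f) < 1e-5f);
  assert(std::fabs(w_std - 0.35f) < 1e-5f);
  assert(std::fabs(g_nag - 0.785f) < 1e-5f);
  assert(std::fabs(w_nag - 0.215f) < 1e-5f);
  return 0;
}
```

The only behavioral difference between the two operators is that MomentumSGDUpdate also applies the adjusted gradient to the PARAM input in place (the param[i] -= ng[i] branch), whereas MomentumSGD passes nullptr for param and leaves the parameter untouched.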
@@ -51,7 +51,10 @@ class IDEEPOperator : public OperatorBase { // FinishDeviceComputation, // it is always just a re-route to RunOnDevice(). try { - return RunOnDevice(); + StartAllObservers(); + bool result = RunOnDevice(); + StopAllObservers(); + return result; } catch (EnforceNotMet& err) { err.AppendMessage(getErrorMsg()); throw; diff --git a/caffe2/ideep/utils/ideep_register.cc b/caffe2/ideep/utils/ideep_register.cc index 020e22fa6143ed..a0b80f8a8e401c 100644 --- a/caffe2/ideep/utils/ideep_register.cc +++ b/caffe2/ideep/utils/ideep_register.cc @@ -4,11 +4,14 @@ #include #include "ideep_context.h" +namespace at { +REGISTER_CONTEXT(DeviceType::IDEEP, caffe2::IDEEPContext); +} // namespace at namespace caffe2 { CAFFE_KNOWN_TYPE(ideep::tensor); -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( IDEEPOperatorRegistry, OperatorBase, const OperatorDef&, @@ -27,7 +30,7 @@ REGISTER_EVENT_ERROR_MESSAGE_FUNCTION(IDEEP, EventErrorMessageCPU); REGISTER_EVENT_SET_FINISHED_FUNCTION(IDEEP, EventSetFinishedCPU); REGISTER_EVENT_RESET_FUNCTION(IDEEP, EventResetCPU); -BaseStaticContext* GetIDEEPStaticContext() { +C10_EXPORT BaseStaticContext* GetIDEEPStaticContext() { static IDEEPStaticContext context; return &context; } diff --git a/caffe2/mkl/mkl_operator.cc b/caffe2/mkl/mkl_operator.cc index bf5b460d0920be..8fba56da8474d6 100644 --- a/caffe2/mkl/mkl_operator.cc +++ b/caffe2/mkl/mkl_operator.cc @@ -9,7 +9,7 @@ CAFFE2_DEFINE_bool( namespace caffe2 { -CAFFE_DEFINE_REGISTRY( +C10_DEFINE_REGISTRY( MKLOperatorRegistry, OperatorBase, const OperatorDef&, diff --git a/caffe2/mkl/utils/mkl_context.cc b/caffe2/mkl/utils/mkl_context.cc index 6e9075df43475f..8c66bc111282ac 100644 --- a/caffe2/mkl/utils/mkl_context.cc +++ b/caffe2/mkl/utils/mkl_context.cc @@ -3,6 +3,10 @@ #include "mkl_context.h" #include "caffe2/core/event_cpu.h" +namespace at { + +REGISTER_CONTEXT(DeviceType::MKLDNN, caffe2::MKLContext); +} // namespace at namespace caffe2 { // MKL events are the same as CPU events diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 0a7b5808a446be..8364026d91c651 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -29,6 +29,8 @@ class MKLContext : public BaseContext { : RandomNumberSeed()) { CAFFE_ENFORCE_EQ(option.device_type(), PROTO_MKLDNN); } + explicit MKLContext(const at::Device& device) + : MKLContext(DeviceToOption(device)) {} ~MKLContext() override {} @@ -155,15 +157,6 @@ class MKLStaticContext : public BaseStaticContext { return GetCPUAllocator()->New(nbytes); } - std::unique_ptr CreateContext() override { - return caffe2::make_unique(); - } - - std::unique_ptr CreateContext( - const DeviceOption& option) override { - return caffe2::make_unique(option); - } - DeviceType GetDeviceType() override { return MKLDNN; } diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h index 736d8ede8cf53d..ffa1899197f2ba 100644 --- a/caffe2/mkl/utils/mkl_memory.h +++ b/caffe2/mkl/utils/mkl_memory.h @@ -148,7 +148,7 @@ class LayoutWrapper { * Most of the MKLMemory functions are not thread safe. */ template -class MKLMemory { +class C10_EXPORT MKLMemory { public: // Initializes an empty MKLMemory. 
MKLMemory() {} @@ -460,7 +460,7 @@ class MKLMemory { return dims_; } - inline const int ndim() const { return dims_.size(); } + inline int ndim() const { return dims_.size(); } inline int dim32(const int i) const { CAFFE_ENFORCE_LT(dims_.at(i), std::numeric_limits::max()); diff --git a/caffe2/mkl/utils/mkl_operator.h b/caffe2/mkl/utils/mkl_operator.h index 2236e9267af542..0f028fbfaa8c01 100644 --- a/caffe2/mkl/utils/mkl_operator.h +++ b/caffe2/mkl/utils/mkl_operator.h @@ -10,20 +10,20 @@ CAFFE2_DECLARE_bool(caffe2_mkl_memonger_in_use); namespace caffe2 { -CAFFE_DECLARE_REGISTRY( +C10_DECLARE_REGISTRY( MKLOperatorRegistry, OperatorBase, const OperatorDef&, Workspace*); #define REGISTER_MKL_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(MKLOperatorRegistry, key, __VA_ARGS__) + C10_REGISTER_CREATOR(MKLOperatorRegistry, key, __VA_ARGS__) #define REGISTER_MKL_OPERATOR(name, ...) \ - CAFFE_REGISTER_CLASS(MKLOperatorRegistry, name, __VA_ARGS__) + C10_REGISTER_CLASS(MKLOperatorRegistry, name, __VA_ARGS__) #define REGISTER_MKL_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(MKLOperatorRegistry, str_name, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(MKLOperatorRegistry, str_name, __VA_ARGS__) #define REGISTER_MKL_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(MKLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) + C10_REGISTER_CLASS(MKLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) namespace mkl { // MKLOperator is the base scaffolding of the operators that uses MKLDNN. It diff --git a/caffe2/mobile/contrib/arm-compute/core/net_gl.h b/caffe2/mobile/contrib/arm-compute/core/net_gl.h index 1dc93dedc3fff3..48a47ff87f3351 100644 --- a/caffe2/mobile/contrib/arm-compute/core/net_gl.h +++ b/caffe2/mobile/contrib/arm-compute/core/net_gl.h @@ -3,10 +3,10 @@ #include +#include "c10/util/Registry.h" #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/net.h" -#include "caffe2/core/registry.h" #include "caffe2/core/tensor.h" #include "caffe2/core/workspace.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/mobile/contrib/arm-compute/core/operator.cc b/caffe2/mobile/contrib/arm-compute/core/operator.cc index bd4337aa85e7cf..cddd0b0129c6a0 100644 --- a/caffe2/mobile/contrib/arm-compute/core/operator.cc +++ b/caffe2/mobile/contrib/arm-compute/core/operator.cc @@ -2,8 +2,11 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY(GLOperatorRegistry, OperatorBase, const OperatorDef &, - Workspace *); +C10_DEFINE_REGISTRY( + GLOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); CAFFE_REGISTER_DEVICE_TYPE(DeviceType::OPENGL, GLOperatorRegistry); } // namespace caffe2 diff --git a/caffe2/mobile/contrib/arm-compute/core/operator.h b/caffe2/mobile/contrib/arm-compute/core/operator.h index 037173054f7715..4df78c7734b849 100644 --- a/caffe2/mobile/contrib/arm-compute/core/operator.h +++ b/caffe2/mobile/contrib/arm-compute/core/operator.h @@ -1,26 +1,29 @@ #ifndef CAFFE2_OPENGL_OPERATOR_H_ #define CAFFE2_OPENGL_OPERATOR_H_ +#include "c10/util/Registry.h" #include "caffe2/core/operator.h" -#include "caffe2/core/registry.h" namespace caffe2 { -CAFFE_DECLARE_REGISTRY(GLOperatorRegistry, OperatorBase, const OperatorDef &, - Workspace *); -#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \ - CAFFE_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__) -#define REGISTER_GL_OPERATOR(name, ...) 
\ - extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \ - CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - } \ - CAFFE_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__) -#define REGISTER_GL_OPERATOR_STR(str_name, ...) \ - CAFFE_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__) +C10_DECLARE_REGISTRY( + GLOperatorRegistry, + OperatorBase, + const OperatorDef&, + Workspace*); +#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \ + C10_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__) +#define REGISTER_GL_OPERATOR(name, ...) \ + extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \ + CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + } \ + C10_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__) +#define REGISTER_GL_OPERATOR_STR(str_name, ...) \ + C10_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__) -#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \ - CAFFE_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) +#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \ + C10_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__) } // namespace caffe2 diff --git a/caffe2/onnx/torch_ops/CMakeLists.txt b/caffe2/onnx/torch_ops/CMakeLists.txt new file mode 100644 index 00000000000000..99443af4cc9bc6 --- /dev/null +++ b/caffe2/onnx/torch_ops/CMakeLists.txt @@ -0,0 +1,5 @@ +# ---[ Extra onnx files. +file(GLOB ONNX_SRCS *.cc) + +# ---[ Send the lists to the parent scope. +set(ONNX_SRCS ${ONNX_SRCS} PARENT_SCOPE) diff --git a/caffe2/onnx/torch_ops/constants.h b/caffe2/onnx/torch_ops/constants.h new file mode 100644 index 00000000000000..ebd2a2464d9b33 --- /dev/null +++ b/caffe2/onnx/torch_ops/constants.h @@ -0,0 +1,7 @@ +namespace ONNX_NAMESPACE { + +const int AI_ONNX_PYTORCH_DOMAIN_MIN_OPSET = 1; +const int AI_ONNX_PYTORCH_DOMAIN_MAX_OPSET = 1; +constexpr const char* AI_ONNX_PYTORCH_DOMAIN = "ai.onnx.pytorch"; + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/defs.cc b/caffe2/onnx/torch_ops/defs.cc new file mode 100644 index 00000000000000..8d03120af03557 --- /dev/null +++ b/caffe2/onnx/torch_ops/defs.cc @@ -0,0 +1,24 @@ +// Copyright (c) Facebook Inc. and Microsoft Corporation. +// Licensed under the MIT license. + +#include "./schema.h" + +namespace ONNX_NAMESPACE { + +static const char* dummy_test_only_ver1_doc = R"DOC( +A dummy op for verifying the build setup works, don't use me. 
+)DOC"; + +ONNX_PYTORCH_OPERATOR_SET_SCHEMA( + DUMMY_TEST_ONLY, + 1, + OpSchema() + .SetDoc(dummy_test_only_ver1_doc) + .Input(0, "input", "Input tensor", "T") + .Output(0, "output", "Output tensor", "T") + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain input and output types to float tensors.")); + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/operator_sets.h b/caffe2/onnx/torch_ops/operator_sets.h new file mode 100644 index 00000000000000..760a6b7fa2a7b6 --- /dev/null +++ b/caffe2/onnx/torch_ops/operator_sets.h @@ -0,0 +1,22 @@ +#pragma once + +#include "onnx/defs/schema.h" + +namespace ONNX_NAMESPACE { + +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(PyTorch, 1, DUMMY_TEST_ONLY); + +// Iterate over schema from ai.onnx.pytorch domain opset 1 +class OpSet_PyTorch_ver1 { + public: + static void ForEachSchema(std::function fn) { + fn(GetOpSchema()); + } +}; + +inline void RegisterPyTorchOperatorSetSchema() { + RegisterOpSetSchema(); +} + +} // namespace ONNX_NAMESPACE diff --git a/caffe2/onnx/torch_ops/schema.cc b/caffe2/onnx/torch_ops/schema.cc new file mode 100644 index 00000000000000..de933c2c23ab2e --- /dev/null +++ b/caffe2/onnx/torch_ops/schema.cc @@ -0,0 +1,17 @@ +#include "./schema.h" +#include "./operator_sets.h" + +namespace { +using namespace ONNX_NAMESPACE; +class PyTorchSchemasRegisterer { + public: + PyTorchSchemasRegisterer() { + OpSchemaRegistry::DomainToVersionRange::Instance().AddDomainToVersion( + AI_ONNX_PYTORCH_DOMAIN, + AI_ONNX_PYTORCH_DOMAIN_MIN_OPSET, + AI_ONNX_PYTORCH_DOMAIN_MAX_OPSET); + RegisterPyTorchOperatorSetSchema(); + } +}; +static PyTorchSchemasRegisterer registerer{}; +} // namespace diff --git a/caffe2/onnx/torch_ops/schema.h b/caffe2/onnx/torch_ops/schema.h new file mode 100644 index 00000000000000..3454e366a1eeba --- /dev/null +++ b/caffe2/onnx/torch_ops/schema.h @@ -0,0 +1,8 @@ +#pragma once + +#include "./constants.h" +#include "onnx/defs/schema.h" + +#define ONNX_PYTORCH_OPERATOR_SET_SCHEMA(name, ver, impl) \ + ONNX_OPERATOR_SET_SCHEMA_EX( \ + name, PyTorch, AI_ONNX_PYTORCH_DOMAIN, ver, false, impl) diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h index 2b9e4d6d5e6ecc..07ee6187443a97 100644 --- a/caffe2/operators/batch_gather_ops.h +++ b/caffe2/operators/batch_gather_ops.h @@ -35,31 +35,52 @@ class BatchGatherOp final : public Operator { auto block_size = data.size_from_dim(2); auto block_bytesize = block_size * data.meta().itemsize(); auto N = indices.size(); - auto data_batch_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto gathered_batch_bytesize = - N * data.size_from_dim(2) * data.meta().itemsize(); + auto data_batch_size = data.size_from_dim(1); + auto gathered_batch_size = N * data.size_from_dim(2); + auto data_batch_bytesize = data_batch_size * data.meta().itemsize(); + auto gathered_batch_bytesize = gathered_batch_size * data.meta().itemsize(); const TInd* idxs = indices.template data(); auto src_base = static_cast(data.raw_data()); auto out = static_cast(output->raw_mutable_data(data.meta())); - for (auto batch = 0; batch < data.dim(0); ++batch) { - for (auto i = 0; i < N; ++i) { - auto idx = idxs[i]; - CAFFE_ENFORCE( - 0 <= idx && idx < data.dim(1), - "INDICES element is out of DATA bounds, id=", - idx, - " data_dim=", - data.dim(1)); - auto src = - src_base + idx * block_bytesize + batch * data_batch_bytesize; - auto dst = out + i * block_bytesize + batch * gathered_batch_bytesize; - 
context_.CopyItemsSameDevice(data.meta(), block_size, src, dst); + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < data.dim(1), + "INDICES element is out of DATA bounds, id=", + idx, + " data_dim=", + data.dim(1)); + } + + if (data.template IsType() && block_size == 1) { + auto src = data.template data(); + auto dst = output->template mutable_data(); + + for (auto batch = 0; batch < data.dim(0); ++batch) { + auto src_batch_base = src + batch * data_batch_size; + auto out_batch_base = dst + batch * gathered_batch_size; + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + out_batch_base[i] = src_batch_base[idx]; + } + } + } else { + for (auto batch = 0; batch < data.dim(0); ++batch) { + auto src_batch_base = src_base + batch * data_batch_bytesize; + auto out_batch_base = out + batch * gathered_batch_bytesize; + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + auto src = src_batch_base + idx * block_bytesize; + auto dst = out_batch_base + i * block_bytesize; + context_.CopyItemsSameDevice(data.meta(), block_size, src, dst); + } } } return true; } - INPUT_TAGS(DATA, INDICES); }; @@ -108,21 +129,32 @@ class BatchGatherGradientOp final : public Operator { auto gathered_batch_size = N * data.size_from_dim(2); const TInd* idxs = indices.template data(); + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + CAFFE_ENFORCE( + 0 <= idx && idx < data.dim(1), + "INDICES element is out of DATA bounds, id=", + idx, + " data_dim=", + data.dim(1)); + } + for (auto batch = 0; batch < grad.dim(0); ++batch) { + auto src_batch_base = grad_data + batch * gathered_batch_size; + auto out_batch_base = out_data + batch * data_batch_size; + for (auto i = 0; i < N; ++i) { auto idx = idxs[i]; - CAFFE_ENFORCE( - 0 <= idx && idx < data.dim(1), - "INDICES element is out of DATA bounds, id=", - idx, - " data_dim=", - data.dim(1)); - math::Add( - block_size, - out_data + idx * block_size + batch * data_batch_size, - grad_data + i * block_size + batch * gathered_batch_size, - out_data + idx * block_size + batch * data_batch_size, - &context_); + if (block_size == 1) { + out_batch_base[idx * block_size] += src_batch_base[i * block_size]; + } else { + math::Add( + block_size, + out_batch_base + idx * block_size, + src_batch_base + i * block_size, + out_batch_base + idx * block_size, + &context_); + } } } return true; diff --git a/caffe2/operators/crf_viterbi_op.cc b/caffe2/operators/crf_viterbi_op.cc new file mode 100644 index 00000000000000..39a5391d735fcd --- /dev/null +++ b/caffe2/operators/crf_viterbi_op.cc @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include "caffe2/core/blob_serialization.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { +namespace { + +void RowwiseMaxAndArg( + const float* mat, + int32_t N, + int32_t D, + float* rowMax, + int32_t* argMax) { + auto eigenMat = ConstEigenMatrixMap(mat, D, N); + for (auto i = 0; i < D; i++) { + // eigenMat.row(i) is equivalent to column i in mat + rowMax[i] = eigenMat.row(i).maxCoeff(argMax + i); + } +} +void ColwiseMaxAndArg( + const float* mat, + int32_t N, + int32_t D, + float* colMax, + int32_t* argMax) { + auto eigenMat = ConstEigenMatrixMap(mat, D, N); + for (auto i = 0; i < N; i++) { + // eigenMat.col(i) is equivalent to row i in mat + colMax[i] = eigenMat.col(i).maxCoeff(argMax + i); + } +} + +class ViterbiPathOp : public Operator { + public: + ViterbiPathOp(const OperatorDef& 
operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + void GatherRow( + const TensorCPU& data, + int32_t rowIndex, + int32_t block_size, + int32_t block_bytesize, + TensorCPU* outRow) { + CAFFE_ENFORCE( + 0 <= rowIndex && rowIndex < data.dim(0), + "rowIndex is out of DATA bounds"); + auto out = static_cast(outRow->raw_mutable_data(data.meta())); + auto src_base = static_cast(data.raw_data()); + auto src = src_base + rowIndex * block_bytesize; + context_.CopyItemsSameDevice(data.meta(), block_size, src, out); + } + + void + AddColToMat(const TensorCPU& mat, const TensorCPU& col, TensorCPU* result) { + float* resultData = result->template mutable_data(); + const float* colData = col.template data(); + // Initialize the columns of the result to be = the input col + for (auto i = 0; i < result->dim32(1); i++) { + for (auto j = 0; j < result->dim32(0); j++) { + resultData[i * result->dim32(0) + j] = colData[i]; + } + } + // Element-wise add of the result and the input matrix + math::Add( + mat.size(), + resultData, + mat.template data(), + resultData, + &context_); + } + + bool RunOnDevice() override { + auto& predictions = Input(0); + auto& transitions = Input(1); + auto* viterbiPath = Output(0); + + CAFFE_ENFORCE( + predictions.ndim() == 2 && transitions.ndim() == 2, + "Predictions and transitions hould 2D matrices"); + + CAFFE_ENFORCE( + predictions.dim(1) == transitions.dim(0), + "Predictions and transitions dimensions not matching"); + + auto seqLen = predictions.dim32(0); + + viterbiPath->Resize(seqLen); + auto block_size = predictions.size() / predictions.dim(0); + auto block_bytesize = + predictions.size_from_dim(1) * predictions.meta().itemsize(); + Tensor backpointers(CPU); + backpointers.ResizeLike(predictions); + + Tensor trellis(std::vector{block_size}, CPU); + Tensor dpMat(CPU); + dpMat.ResizeLike(transitions); + Tensor dpMax(std::vector{block_size}, CPU); + GatherRow(predictions, 0, block_size, block_bytesize, &trellis); + for (auto i = 1; i < seqLen; i++) { + AddColToMat(transitions, trellis, &dpMat); + RowwiseMaxAndArg( + dpMat.template data(), + dpMat.dim(0), + dpMat.dim(1), + dpMax.template mutable_data(), + backpointers.template mutable_data() + (i * block_size)); + + GatherRow(predictions, i, block_size, block_bytesize, &trellis); + math::Add( + trellis.size(), + trellis.template data(), + dpMax.template data(), + trellis.template mutable_data(), + &context_); + } + + Tensor tMax(std::vector{1}, CPU); + Tensor tArgMax(std::vector{1}, CPU); + ColwiseMaxAndArg( + trellis.template data(), + 1, + trellis.size(), + tMax.template mutable_data(), + tArgMax.template mutable_data()); + + std::vector viterbiVec; + viterbiVec.push_back(tArgMax.template data()[0]); + Tensor bpEntry(std::vector{block_size}, CPU); + block_bytesize = + backpointers.size_from_dim(1) * backpointers.meta().itemsize(); + for (auto i = seqLen - 1; i > 0; i--) { + GatherRow(backpointers, i, block_size, block_bytesize, &bpEntry); + viterbiVec.push_back(bpEntry.template data()[viterbiVec.back()]); + } + std::reverse_copy( + viterbiVec.begin(), + viterbiVec.end(), + viterbiPath->template mutable_data()); + return true; + } +}; +class SwapBestPathOp : public Operator { + public: + SwapBestPathOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + bool RunOnDevice() override { + auto& data = Input(0); + auto& newBestIdicies = Input(1); + auto* updatedData = Output(0); + + CAFFE_ENFORCE( + data.ndim() == 2 && newBestIdicies.ndim() == 1, + "predictions should be a 2D matrix 
and bestPath should be a 1D vector");
+
+ CAFFE_ENFORCE(
+ data.dim(0) == newBestIdicies.dim(0),
+ "predictions and bestPath dimensions not matching");
+
+ updatedData->ResizeLike(data);
+ float* outData = updatedData->template mutable_data();
+ context_.CopyItemsSameDevice(
+ data.meta(), data.size(), data.template data(), outData);
+
+ Tensor bestScores(CPU);
+ bestScores.ResizeLike(newBestIdicies);
+ Tensor oldBestIndices(CPU);
+ oldBestIndices.ResizeLike(newBestIdicies);
+
+ ColwiseMaxAndArg(
+ data.template data(),
+ data.dim(0),
+ data.dim(1),
+ bestScores.template mutable_data(),
+ oldBestIndices.template mutable_data());
+
+ auto block_size = data.size() / data.dim(0);
+
+ const int32_t* oldBestIdx = oldBestIndices.template data();
+ const int32_t* newIdx = newBestIdicies.template data();
+
+ for (auto i = 0; i < data.dim32(0); i++) {
+ std::swap(
+ outData[i * block_size + newIdx[i]],
+ outData[i * block_size + oldBestIdx[i]]);
+ }
+ return true;
+ }
+};
+REGISTER_CPU_OPERATOR(ViterbiPath, ViterbiPathOp);
+OPERATOR_SCHEMA(ViterbiPath)
+ .NumInputs(2)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Given a predictions matrix and a transitions matrix, get the path with the best
+score
+)DOC")
+ .Input(0, "predictions", "N*D predictions matrix")
+ .Input(1, "transitions", "D*D transitions matrix")
+ .Output(0, "viterbi_path", "N*1 vector holds the best path indices");
+NO_GRADIENT(ViterbiPath);
+REGISTER_CPU_OPERATOR(SwapBestPath, SwapBestPathOp);
+OPERATOR_SCHEMA(SwapBestPath)
+ .NumInputs(2)
+ .NumOutputs(1)
+ .SetDoc(R"DOC(
+Given a sequence of indices and a matrix, enforce that these indices have the
+best columnwise scores
+)DOC")
+ .Input(0, "predictions", "N*D predictions matrix")
+ .Input(1, "bestPath", "N*1 vector holds the best path indices")
+ .Output(0, "new_predictions", "N*D updated predictions matrix");
+NO_GRADIENT(SwapBestPath);
+} // namespace
+} // namespace caffe2
diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
index b6966fe89c76fc..f70149110378fc 100644
--- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
+++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h"
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"
namespace caffe2 { REGISTER_CPU_OPERATOR(
diff --git a/caffe2/operators/fused_rowwise_random_quantization_ops.cc b/caffe2/operators/fused_rowwise_random_quantization_ops.cc
index ca5d8f25d3a9f2..9dec789393d993 100644
--- a/caffe2/operators/fused_rowwise_random_quantization_ops.cc
+++ b/caffe2/operators/fused_rowwise_random_quantization_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/fused_rowwise_random_quantization_ops.h"
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc
index 6dc47d7781d131..513ac64e795c41 100644
--- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc
+++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc
@@ -1,5 +1,5 @@
#include "caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h"
-#include "caffe2/core/registry.h"
+#include "c10/util/Registry.h"
namespace caffe2 {
diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc
index bfa1a666e6ed9d..5ecfceef5dc612 100644
---
a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc @@ -1,5 +1,5 @@ #include "caffe2/operators/lengths_reducer_rowwise_8bit_ops.h" -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { diff --git a/caffe2/operators/load_save_op.cc b/caffe2/operators/load_save_op.cc index ffef2f8b39fb5b..50dcf5259a84eb 100644 --- a/caffe2/operators/load_save_op.cc +++ b/caffe2/operators/load_save_op.cc @@ -5,6 +5,7 @@ namespace caffe2 { template <> void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { + proto->mutable_tensor()->clear_device_detail(); proto->mutable_tensor()->mutable_device_detail()->set_device_type( PROTO_CPU); } diff --git a/caffe2/operators/load_save_op_gpu.cc b/caffe2/operators/load_save_op_gpu.cc index cd70e9c2b5df2f..eaa90b3dcdbc13 100644 --- a/caffe2/operators/load_save_op_gpu.cc +++ b/caffe2/operators/load_save_op_gpu.cc @@ -6,6 +6,7 @@ namespace caffe2 { template <> void LoadOp::SetCurrentDevice(BlobProto* proto) { if (proto->has_tensor()) { + proto->mutable_tensor()->clear_device_detail(); auto* device_detail = proto->mutable_tensor()->mutable_device_detail(); device_detail->set_device_type(PROTO_CUDA); device_detail->set_cuda_gpu_id(CaffeCudaGetDevice()); diff --git a/caffe2/operators/segment_reduction_op.cc b/caffe2/operators/segment_reduction_op.cc index 4029a52d0c3587..482d0599fc0e2b 100644 --- a/caffe2/operators/segment_reduction_op.cc +++ b/caffe2/operators/segment_reduction_op.cc @@ -51,6 +51,7 @@ OPERATOR_SCHEMA(SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient) REGISTER_CPU_OPERATOR( SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient, AbstractLengthsWithMainInputGradientOp< + float, float, int, CPUContext, diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index 9e7ab6d604016c..4449613787e881 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -1616,6 +1616,7 @@ class AbstractLengthsGradientOp : public Operator { // Version of gradient that requires the main input and thus needs to receive // length, indices and other stuff template < + typename Tembedding, typename T, typename TLengths, class Context, @@ -1689,8 +1690,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); T* dataGrads = dataGradsOutput->template mutable_data(); - const T* data = dataInput.template data(); - + const Tembedding* data = dataInput.template data(); int64_t dataIndex = 0; for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { ReducerGradient reducer( @@ -1945,6 +1945,7 @@ segments, i.e. len(*LENGTHS*). using BackwardOp = AbstractLengthsGradientOp; using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< + T, T, SIndex, Context, @@ -2048,6 +2049,7 @@ i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor. 
ReducerGradient, false /*GradientNeedIndices*/>; using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< + T, T, SIndex, Context,
diff --git a/caffe2/operators/stats_put_ops.cc b/caffe2/operators/stats_put_ops.cc
new file mode 100644
index 00000000000000..40c6b8cc60d085
--- /dev/null
+++ b/caffe2/operators/stats_put_ops.cc
@@ -0,0 +1,92 @@
+#include "caffe2/operators/stats_put_ops.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/stats.h"
+#include "caffe2/core/tensor.h"
+
+namespace caffe2 {
+#define REGISTER_TEMPLATED_STAT_PUT_OP(OP_NAME, STAT_NAME, STAT_MACRO) \
+ struct STAT_NAME { \
+ CAFFE_STAT_CTOR(STAT_NAME); \
+ STAT_MACRO(stat_value); \
+ }; \
+ REGISTER_CPU_OPERATOR(OP_NAME, TemplatePutOp);
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+ AveragePut,
+ AveragePutStat,
+ CAFFE_AVG_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(AveragePut)
+ .NumInputs(1)
+ .NumOutputs(0)
+ .Arg(
+ "name",
+ "(*str*): name of the stat. If not present, then uses name of input blob")
+ .Arg(
+ "magnitude_expand",
+ "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+ .SetDoc(R"DOC(
+ Consumes a value and pushes it to the global stat registry as an average.
+
+ Github Links:
+ - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+ )DOC")
+ .Input(
+ 0,
+ "value",
+ "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+ IncrementPut,
+ IncrementPutStat,
+ CAFFE_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(IncrementPut)
+ .NumInputs(1)
+ .NumOutputs(0)
+ .Arg(
+ "name",
+ "(*str*): name of the stat. If not present, then uses name of input blob")
+ .Arg(
+ "magnitude_expand",
+ "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+ .SetDoc(R"DOC(
+ Consumes a value and pushes it to the global stat registry as a sum.
+
+ Github Links:
+ - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc
+
+ )DOC")
+ .Input(
+ 0,
+ "value",
+ "(*Tensor``*): A scalar tensor, representing any numeric value");
+
+REGISTER_TEMPLATED_STAT_PUT_OP(
+ StdDevPut,
+ StdDevPutStat,
+ CAFFE_STDDEV_EXPORTED_STAT)
+
+OPERATOR_SCHEMA(StdDevPut)
+ .NumInputs(1)
+ .NumOutputs(0)
+ .Arg(
+ "name",
+ "(*str*): name of the stat. If not present, then uses name of input blob")
+ .Arg(
+ "magnitude_expand",
+ "(*int64_t*): number to multiply input values by (used when inputting floats, as stats can only receive integers)")
+ .SetDoc(R"DOC(
+ Consumes a value and pushes it to the global stat registry as a standard deviation.
+ + Github Links: + - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/stats_put_ops.cc + + )DOC") + .Input( + 0, + "value", + "(*Tensor``*): A scalar tensor, representing any numeric value"); + +} // namespace caffe2 diff --git a/caffe2/operators/stats_put_ops.h b/caffe2/operators/stats_put_ops.h new file mode 100644 index 00000000000000..659df219809d34 --- /dev/null +++ b/caffe2/operators/stats_put_ops.h @@ -0,0 +1,53 @@ +#include +#include "caffe2/core/operator.h" +#include "caffe2/core/stats.h" +#include "caffe2/core/tensor.h" +#include "caffe2/core/types.h" + +namespace caffe2 { + +template +struct TemplatePutOp : public Operator { + TemplatePutOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + given_name_(GetSingleArgument( + "stat_name", + operator_def.input().Get(0))), + magnitude_expand_(GetSingleArgument("magnitude_expand", 1)), + stat_(given_name_) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + auto input = *Input(0).template data(); + + CAFFE_ENFORCE( + static_cast(input + 1) < + std::numeric_limits::max() / magnitude_expand_, + "Input value is too large for the given magnitude expansion!"); + + int64_t int_value = input * magnitude_expand_; + + CAFFE_EVENT(stat_, stat_value, int_value); + + return true; + } + + private: + const std::string given_name_; + const long magnitude_expand_; + T stat_; +}; +} // namespace caffe2 diff --git a/caffe2/opt/annotations.cc b/caffe2/opt/annotations.cc index 937fb789cce125..271ce3dcc4c61b 100644 --- a/caffe2/opt/annotations.cc +++ b/caffe2/opt/annotations.cc @@ -27,6 +27,27 @@ caffe2::OperatorDef* Caffe2Annotation::getMutableOperatorDef() { } // Distributed annotations +void Caffe2Annotation::setDeviceOption(const caffe2::DeviceOption& devOpt) { + *OpDef.mutable_device_option() = devOpt; +} + +bool Caffe2Annotation::hasDeviceOption() const { + return OpDef.has_device_option(); +} + +const caffe2::DeviceOption& Caffe2Annotation::getDeviceOption() const { + CAFFE_ENFORCE( + hasDeviceOption(), + "DeviceOption was never set. Use Caffe2Annotation::setDeviceOption."); + return OpDef.device_option(); +} +caffe2::DeviceOption* Caffe2Annotation::getMutableDeviceOption() { + CAFFE_ENFORCE( + hasDeviceOption(), + "DeviceOption was never set. 
Use Caffe2Annotation::setDeviceOption."); + return OpDef.mutable_device_option(); +} + void Caffe2Annotation::setDevice(std::string device) { Device = device; } diff --git a/caffe2/opt/annotations.h b/caffe2/opt/annotations.h index e143c5e960c542..9bc1f1e3137648 100644 --- a/caffe2/opt/annotations.h +++ b/caffe2/opt/annotations.h @@ -19,6 +19,11 @@ class CAFFE2_API Caffe2Annotation : public nom::repr::Annotation { const caffe2::OperatorDef& getOperatorDef() const; caffe2::OperatorDef* getMutableOperatorDef(); + void setDeviceOption(const caffe2::DeviceOption& opDef); + bool hasDeviceOption() const; + const caffe2::DeviceOption& getDeviceOption() const; + caffe2::DeviceOption* getMutableDeviceOption(); + // Distributed annotations void setDevice(std::string device); const std::string getDevice() const; diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index f9956060b75cfd..46fd8349b05832 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -56,7 +56,7 @@ int getGroup(std::map& argMap) { namespace caffe2 { -CAFFE_DEFINE_REGISTRY(ConverterRegistry, Converter); +C10_DEFINE_REGISTRY(ConverterRegistry, Converter); std::map Converter::getArgumentsFromOperator( caffe2::OperatorDef op) { @@ -519,4 +519,48 @@ caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m, const caffe2::NetDef& old return predictNet; } +void pushOpToFront(caffe2::OperatorDef& op, caffe2::NetDef* net) { + *net->add_op() = op; + google::protobuf::RepeatedPtrField* op_list( + net->mutable_op()); + // Reverse iterate, swapping new element in front each time + for (int i(net->op_size() - 1); i > 0; --i) { + op_list->SwapElements(i, i - 1); + } +} + +void injectDataEdgeIndicators(caffe2::NetDef* net) { + for (const auto& input : net->external_input()) { + caffe2::OperatorDef op; + op.set_type("Declare"); + op.add_output(input); + pushOpToFront(op, net); + } + for (const auto& output : net->external_output()) { + caffe2::OperatorDef op; + op.set_type("Export"); + op.add_input(output); + *net->add_op() = op; + } + net->clear_external_input(); + net->clear_external_output(); +} + +void removeDataEdgeIndicators(caffe2::NetDef* net) { + google::protobuf::RepeatedPtrField* op_list( + net->mutable_op()); + for (auto i = 0; i < net->op_size(); ++i) { + auto op = net->op(i); + if (op.type() == "Declare") { + net->add_external_input(op.output(0)); + } else if (op.type() == "Export") { + net->add_external_output(op.input(0)); + } else { + continue; + } + // Note that this compensates for modifying the list inplace + op_list->DeleteSubrange(i--, 1); + } +} + } // namespace caffe2 diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index c106fc66057916..9666739d14f016 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -13,6 +13,9 @@ namespace caffe2 { +CAFFE2_API void injectDataEdgeIndicators(caffe2::NetDef* net); +CAFFE2_API void removeDataEdgeIndicators(caffe2::NetDef* net); + CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); @@ -44,9 +47,9 @@ class CAFFE2_API Converter { virtual ~Converter() {} }; -CAFFE_DECLARE_REGISTRY(ConverterRegistry, Converter); +C10_DECLARE_REGISTRY(ConverterRegistry, Converter); #define REGISTER_CONVERTER(name, cls) \ - CAFFE_REGISTER_CLASS(ConverterRegistry, name, cls) + C10_REGISTER_CLASS(ConverterRegistry, name, cls) #define TRIVIAL_CONVERTER(opName) \ class opName##Converter : public Converter { \ diff --git a/caffe2/opt/converter_nomigraph_test.cc 
b/caffe2/opt/converter_nomigraph_test.cc index 995c9a5961c800..e9da69a42dbe3c 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -98,3 +98,37 @@ TEST(Converter, ExternalOutputs) { EXPECT_EQ(new_netdef.external_output(i), net.external_output(i)); } } + +TEST(Converter, InjectDataEdgeIndicators) { + auto net = fakeNet(); + caffe2::injectDataEdgeIndicators(&net); + + EXPECT_EQ(net.op_size(), 3 + 1 + 2); // Inserted 1 Declare and 2 Export + + auto declare_count = 0; + auto export_count = 0; + for (const auto& op : net.op()) { + declare_count += op.type() == "Declare"; + export_count += op.type() == "Export"; + } + EXPECT_EQ(declare_count, 1); + EXPECT_EQ(export_count, 2); + + // Remove them from the network + EXPECT_EQ(net.external_input_size(), 0); + EXPECT_EQ(net.external_output_size(), 0); + + // Ensure nomnigraph can handle this change + auto nn = caffe2::convertToNNModule(net); + auto new_net = caffe2::convertToCaffe2Proto(nn); + + caffe2::removeDataEdgeIndicators(&new_net); + + for (const auto& op : new_net.op()) { + EXPECT_NE(op.type(), "Declare"); + EXPECT_NE(op.type(), "Export"); + } + + EXPECT_EQ(new_net.external_input_size(), 1); + EXPECT_EQ(new_net.external_output_size(), 2); +} diff --git a/caffe2/opt/passes.cc b/caffe2/opt/passes.cc index e9f05a9df01c79..74250d1bbb3b12 100644 --- a/caffe2/opt/passes.cc +++ b/caffe2/opt/passes.cc @@ -2,7 +2,11 @@ namespace caffe2 { -CAFFE_DEFINE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*); -CAFFE_DEFINE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); +C10_DEFINE_REGISTRY( + WorkspaceOptimizationPassRegistry, + WorkspaceOptimizationPass, + NNModule*, + Workspace*); +C10_DEFINE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); } // namespace caffe2 diff --git a/caffe2/opt/passes.h b/caffe2/opt/passes.h index 056dbcf8779b3b..fc15dcad13fe7b 100644 --- a/caffe2/opt/passes.h +++ b/caffe2/opt/passes.h @@ -40,9 +40,13 @@ class CAFFE2_API WorkspaceOptimizationPass : public OptimizationPass { Workspace* ws_; }; -CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationPass, NNModule*, Workspace*); +C10_DECLARE_REGISTRY( + WorkspaceOptimizationPassRegistry, + WorkspaceOptimizationPass, + NNModule*, + Workspace*); #define REGISTER_WS_OPT_PASS(clsname) \ - CAFFE_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname) + C10_REGISTER_CLASS(WorkspaceOptimizationPassRegistry, clsname, clsname) #define REGISTER_WS_OPT_PASS_FROM_FUNC(passname, funcname) \ class passname : public WorkspaceOptimizationPass { \ public: \ @@ -53,9 +57,9 @@ CAFFE_DECLARE_REGISTRY(WorkspaceOptimizationPassRegistry, WorkspaceOptimizationP }; \ REGISTER_WS_OPT_PASS(passname); -CAFFE_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); +C10_DECLARE_REGISTRY(OptimizationPassRegistry, OptimizationPass, NNModule*); #define REGISTER_OPT_PASS(clsname) \ - CAFFE_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname) + C10_REGISTER_CLASS(OptimizationPassRegistry, clsname, clsname) #define REGISTER_OPT_PASS_FROM_FUNC(passname, funcname) \ class passname : public OptimizationPass { \ public: \ diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h index 0a08c8db241e98..ded59d52b21f47 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -47,6 +47,10 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto 
p) { } } +inline CAFFE2_API DeviceType ProtoToType(int p) { + return ProtoToType(static_cast(p)); +} + inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { switch (t) { case DeviceType::CPU: @@ -77,4 +81,59 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { } } +inline CAFFE2_API caffe2::DeviceOption DeviceToOption( + const at::Device& device) { + caffe2::DeviceOption option; + auto type = device.type(); + option.set_device_type(TypeToProto(type)); + + switch (type) { + case DeviceType::CPU: + if (device.index() != -1) { + option.set_numa_node_id(device.index()); + } + break; + case DeviceType::CUDA: + option.set_cuda_gpu_id(device.index()); + break; + case DeviceType::HIP: + option.set_hip_gpu_id(device.index()); + break; + case DeviceType::OPENGL: + case DeviceType::OPENCL: + case DeviceType::MKLDNN: + case DeviceType::IDEEP: + case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES: + case DeviceType::ONLY_FOR_TEST: + break; + default: + AT_ERROR( + "Unknown device:", + static_cast(type), + ". If you have recently updated the caffe2.proto file to add a new " + "device type, did you forget to update the ProtoToType() and TypeToProto" + "function to reflect such recent changes?"); + } + return option; +} + +inline CAFFE2_API at::Device OptionToDevice(const caffe2::DeviceOption option) { + auto type = option.device_type(); + int32_t id = -1; + switch (type) { + case caffe2::PROTO_CPU: + if (option.has_numa_node_id()) { + id = option.numa_node_id(); + } + break; + case caffe2::PROTO_CUDA: + id = option.cuda_gpu_id(); + break; + case caffe2::PROTO_HIP: + id = option.hip_gpu_id(); + break; + } + return at::Device(ProtoToType(type), id); +} + } // namespace caffe2 diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py new file mode 100644 index 00000000000000..dd1c8720bfb153 --- /dev/null +++ b/caffe2/python/crf_predict.py @@ -0,0 +1,33 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import numpy as np +from caffe2.python.crf import CRFWithLoss + + +def crf_update_predictions(model, crf_with_loss, classes): + return apply_crf( + model.param_init_net, + model.net, + crf_with_loss.transitions, + classes, + crf_with_loss.num_classes, + ) + + +def apply_crf(init_net, net, transitions, predictions, num_classes): + padded_classes = CRFWithLoss.pad_predictions( + predictions, init_net, net, num_classes + ) + bestPath = net.ViterbiPath([padded_classes, transitions]) + new_padded_classes = net.SwapBestPath([padded_classes, bestPath]) + # Revert the effect of pad_predictions by removing the last two rows and + # the last two columns + new_classes = net.RemovePadding( + [new_padded_classes], padding_width=1, end_padding_width=1 + ) + slice_starts = np.array([0, 0]).astype(np.int32) + slice_ends = np.array([-1, -3]).astype(np.int32) + slice_starts = net.GivenTensorIntFill([], shape=[2], values=slice_starts) + slice_ends = net.GivenTensorIntFill([], shape=[2], values=slice_ends) + new_classes = net.Slice([new_classes, slice_starts, slice_ends]) + return new_classes diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py new file mode 100644 index 00000000000000..a4502d27e3e990 --- /dev/null +++ b/caffe2/python/crf_viterbi_test.py @@ -0,0 +1,45 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from caffe2.python import workspace, crf + +from caffe2.python.cnn import CNNModelHelper +from 
caffe2.python.crf_predict import crf_update_predictions +from caffe2.python.test_util import TestCase +import hypothesis.strategies as st +from hypothesis import given +import numpy as np + + +class TestCrfDecode(TestCase): + + @given(num_tags=st.integers(2, 4), num_words=st.integers(2, 15)) + def test_crf_viterbi(self, num_tags, num_words): + model = CNNModelHelper(name='external') + predictions = np.random.randn(num_words, num_tags).astype(np.float32) + transitions = np.random.uniform( + low=-1, high=1, size=(num_tags + 2, num_tags + 2) + ).astype(np.float32) + predictions_blob, transitions_blob = ( + model.net.AddExternalInputs('predictions', 'crf_transitions') + ) + workspace.FeedBlob(str(transitions_blob), transitions) + workspace.FeedBlob(str(predictions_blob), predictions) + crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob) + + updated_predictions = crf_update_predictions( + model, crf_layer, predictions_blob + ) + ref_predictions = crf_layer.update_predictions(predictions_blob) + + workspace.RunNetOnce(model.param_init_net) + workspace.RunNetOnce(model.net) + + updated_predictions = workspace.FetchBlob(str(updated_predictions)) + ref_predictions = workspace.FetchBlob(str(ref_predictions)) + np.testing.assert_allclose( + updated_predictions, + ref_predictions, + atol=1e-4, rtol=1e-4, err_msg='Mismatch in CRF predictions' + ) diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py new file mode 100644 index 00000000000000..90b49a8600d76c --- /dev/null +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -0,0 +1,61 @@ +from __future__ import unicode_literals +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import hypothesis.strategies as st +import unittest +import caffe2.python.hypothesis_test_util as hu +from caffe2.python import core, workspace +from hypothesis import given +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class TestMomentumSGDUpdateOps(hu.HypothesisTestCase): + @given(n=st.integers(4, 8), nesterov=st.booleans(), + **mu.gcs) + def test_MomentumSGDUpdate(self, n, nesterov, gc, dc): + param = np.random.rand(n).astype(np.float32) + grad = np.random.rand(n).astype(np.float32) + lr = np.random.rand(1).astype(np.float32) + param_momentum = np.random.rand(n).astype(np.float32) + momentum = 0.9 + op = core.CreateOperator( + "MomentumSGDUpdate", + ["grad", "param_momentum", "lr", "param"], + ["grad", "param_momentum", "param"], + momentum=momentum, + nesterov=int(nesterov), + ) + # Iter lives on the CPU + input_device_options = {'lr': hu.cpu_do} + + self.assertDeviceChecks( + dc, + op, + [grad, param_momentum, lr, param], + [0], + input_device_options=input_device_options, + threshold=0.001) + + op_noparam = core.CreateOperator( + "MomentumSGD", + ["grad", "param_momentum", "lr"], + ["grad", "param_momentum"], + momentum=momentum, + nesterov=int(nesterov), + ) + + self.assertDeviceChecks( + dc, + op_noparam, + [grad, param_momentum, lr], + [0], + input_device_options=input_device_options, + threshold=0.001) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index d5ab8e58cec0f2..5d05c1e5b23b91 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -39,10 +39,11 @@ def _inputs(draw): rows_num = 
draw(st.integers(1, 100)) index_num = draw(st.integers(1, 10)) batch_size = draw(st.integers(2, 10)) + block_size = draw(st.integers(1, 2)) return ( draw(hnp.arrays( np.float32, - (batch_size, rows_num, 2), + (batch_size, rows_num, block_size), elements=st.floats(-10.0, 10.0), )), draw(hnp.arrays( diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index 07f378beb18ff0..2d53027a0a053d 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import errno import hypothesis.strategies as st -from hypothesis import given +from hypothesis import given, assume import numpy as np import os import shutil @@ -42,6 +42,8 @@ def load_save(self, src_device_type, src_gpu_id, np.int16, np.int32, np.int64, np.uint8, np.uint16] arrays = [np.random.permutation(6).reshape(2, 3).astype(T) for T in dtypes] + assume(src_device_type == caffe2_pb2.CUDA or src_gpu_id == 0) + assume(dst_device_type == caffe2_pb2.CUDA or dst_gpu_id == 0) src_device_option = core.DeviceOption( src_device_type, src_gpu_id) dst_device_option = core.DeviceOption( diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py new file mode 100644 index 00000000000000..d3757c3b396e50 --- /dev/null +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -0,0 +1,102 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core, workspace +from caffe2.python.test_util import TestCase +import numpy as np + + +class TestPutOps(TestCase): + + def test_avg_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "a1".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + count_postfix = "/stat_value/count".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "AveragePut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + put_value * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) + + def test_increment_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "i1".encode('ascii') + member_postfix = "/stat_value".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "IncrementPut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + member_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + member_postfix], + put_value * magnitude_expand) + + def test_stddev_put_ops(self): + put_value = 15.1111 + magnitude_expand = 10000 + stat_name = "s1".encode('ascii') + sum_postfix = "/stat_value/sum".encode("ascii") + 
count_postfix = "/stat_value/count".encode("ascii") + sumoffset_postfix = "/stat_value/sumoffset".encode("ascii") + sumsqoffset_postfix = "/stat_value/sumsqoffset".encode("ascii") + + workspace.FeedBlob("value", np.array([put_value], dtype=np.float)) + + workspace.RunOperatorOnce(core.CreateOperator( + "StdDevPut", + "value", + [], + stat_name=stat_name, + magnitude_expand=magnitude_expand)) + + workspace.RunOperatorOnce(core.CreateOperator( + 'StatRegistryExport', [], ['k', 'v', 't'])) + + k = workspace.FetchBlob('k') + v = workspace.FetchBlob('v') + + stat_dict = dict(zip(k, v)) + + self.assertIn(stat_name + sum_postfix, stat_dict) + self.assertIn(stat_name + count_postfix, stat_dict) + self.assertIn(stat_name + sumoffset_postfix, stat_dict) + self.assertIn(stat_name + sumsqoffset_postfix, stat_dict) + self.assertEquals(stat_dict[stat_name + sum_postfix], + put_value * magnitude_expand) + self.assertEquals(stat_dict[stat_name + count_postfix], 1) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 482d16a0dfa6a6..0c5b18b0b6ab11 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -22,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"] + logger = logging.getLogger(__name__) @@ -584,7 +586,7 @@ def _run(self, net, param_init_net, param_info): value=0.0 ) else: - if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + if self.engine in FP16_ENGINES: shapes, types = workspace.InferShapesAndTypes([param_init_net]) assert str(param) in shapes, shapes shape = shapes[str(param)] diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 9a1d715bfdf225..7062ead045df1c 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -53,12 +53,12 @@ static std::string gCurrentWorkspaceName; BlobFetcherBase::~BlobFetcherBase() {} BlobFeederBase::~BlobFeederBase() {} -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, std::unique_ptr); -CAFFE_DEFINE_TYPED_REGISTRY( +C10_DEFINE_TYPED_REGISTRY( BlobFeederRegistry, caffe2::DeviceType, BlobFeederBase, diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 4f81569e429369..dcb416b07a8fea 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -60,24 +60,24 @@ class BlobFeederBase { Feed(const DeviceOption& option, PyArrayObject* array, Blob* blob) = 0; }; -C10_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, std::unique_ptr); #define REGISTER_BLOB_FETCHER(id, ...) \ - CAFFE_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__) inline unique_ptr CreateFetcher(TypeIdentifier id) { return BlobFetcherRegistry()->Create(id); } -CAFFE_DECLARE_TYPED_REGISTRY( +C10_DECLARE_TYPED_REGISTRY( BlobFeederRegistry, DeviceType, BlobFeederBase, std::unique_ptr); #define REGISTER_BLOB_FEEDER(device_type, ...) 
\ - CAFFE_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__) + C10_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__) inline unique_ptr CreateFeeder(int device_type) { return BlobFeederRegistry()->Create( caffe2::ProtoToType(static_cast(device_type))); @@ -148,7 +148,7 @@ class TensorFetcher : public BlobFetcherBase { } if (result.copied) { - auto context = tensor.GetStaticContext()->CreateContext(); + auto context = CreateContext(tensor.GetDeviceType()); context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); context->FinishDeviceComputation(); } diff --git a/caffe2/python/pybind_state_registry.cc b/caffe2/python/pybind_state_registry.cc index 9dfb87731ff4de..77fabf34256480 100644 --- a/caffe2/python/pybind_state_registry.cc +++ b/caffe2/python/pybind_state_registry.cc @@ -5,7 +5,7 @@ namespace python { namespace py = pybind11; -CAFFE_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); +C10_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); } // namespace python } // namespace caffe2 diff --git a/caffe2/python/pybind_state_registry.h b/caffe2/python/pybind_state_registry.h index a107e7db8ea0ad..18bb0a3dbaa01d 100644 --- a/caffe2/python/pybind_state_registry.h +++ b/caffe2/python/pybind_state_registry.h @@ -1,7 +1,7 @@ #pragma once #include -#include "caffe2/core/registry.h" +#include "c10/util/Registry.h" namespace caffe2 { namespace python { @@ -14,19 +14,16 @@ struct PybindAddition { virtual ~PybindAddition(){}; }; -CAFFE_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); +C10_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&); -#define REGISTER_PYBIND_ADDITION(funcname) \ - namespace { \ - struct funcname##Impl : public PybindAddition { \ - funcname##Impl(py::module& m) { \ - funcname(m); \ - } \ - }; \ - CAFFE_REGISTER_CLASS( \ - PybindAdditionRegistry, \ - funcname##Impl, \ - funcname##Impl); \ +#define REGISTER_PYBIND_ADDITION(funcname) \ + namespace { \ + struct funcname##Impl : public PybindAddition { \ + funcname##Impl(py::module& m) { \ + funcname(m); \ + } \ + }; \ + C10_REGISTER_CLASS(PybindAdditionRegistry, funcname##Impl, funcname##Impl); \ } } // namespace python diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index dc1f7370132230..fd4b3ab030428d 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -30,6 +30,10 @@ def randBlobsFloat32(names, *dims, **kwargs): randBlobFloat32(name, *dims, **kwargs) +def numOps(net): + return len(net.Proto().op) + + def str_compare(a, b, encoding="utf8"): if isinstance(a, bytes): a = a.decode(encoding) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 26f5450605a1c1..502c844404c567 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -30,134 +30,86 @@ class TestTransformations(tu.TestCase): - def test_transformer_AddNNPACK(self): + def _base_test_net(self): net = core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) + return net + + def _add_nnpack(self, net): transformer.AddNNPACK(net) assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - def test_transformer_FuseNNPACKConvRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") + 
def _fuse_nnpack_convrelu(self, net, expected_result_num_ops, + expected_activation_arg=True): + self._add_nnpack(net) transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 + self.assertEquals(tu.numOps(net), expected_result_num_ops) has_activation_arg = False for arg in net.Proto().op[0].arg: if tu.str_compare(arg.name, "activation"): assert tu.str_compare(arg.s, "Relu") has_activation_arg = True - assert has_activation_arg + if expected_activation_arg: + assert has_activation_arg + else: + assert not has_activation_arg + + def test_transformer_AddNNPACK(self): + net = self._base_test_net() + net.Relu(["Y"], ["Y2"]) + self._add_nnpack(net) + + def test_transformer_FuseNNPACKConvRelu(self): + net = self._base_test_net() + net.Relu(["Y"], ["Y2"]) + self._fuse_nnpack_convrelu(net, 1) def test_noFuseNNPACKConvRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y2"]) net.Relu(["Y"], ["Y3"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 3 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation") and tu.str_compare(arg.s, "Relu"): - has_activation_arg = True - assert not has_activation_arg + self._fuse_nnpack_convrelu(net, 3, expected_activation_arg=False) def test_transformer_FuseNNPACKConvReluNoInplace(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["X"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 1) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] def test_transformer_FuseNNPACKConvReluInplaceRelu(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 1 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 1) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] def test_transformer_FuseNNPACKConvReluPingPongNaming(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["X"]) net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert 
has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y2"]) net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.Relu(["Y"], ["Y"]) net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y2"], ["Y2"]) - transformer.AddNNPACK(net) # get the NNPACK engine - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - transformer.FuseNNPACKConvRelu(net) - assert len(net.Proto().op) == 2 - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - assert has_activation_arg + self._fuse_nnpack_convrelu(net, 2) assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] def test_transformer_SinkMaxPool(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") + net = self._base_test_net() net.MaxPool(["Y"], ["Y1"], kernel=3) net.Relu(["Y1"], ["Y1"]) transformer.SinkMaxPool(net) @@ -205,7 +157,7 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference @@ -256,7 +208,7 @@ def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, orde transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference @@ -307,7 +259,7 @@ def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channe transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() print("pre") @@ -365,7 +317,7 @@ def test_transformer_FuseConv3DBN( transformer.FuseConvBN(net) # Ensure fusion - assert len(net.Proto().op) == 1 + assert tu.numOps(net) == 1 workspace.RunNetOnce(net) postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference diff --git a/cmake/Dependencies.cmake 
b/cmake/Dependencies.cmake index 82aff7b8cc87d5..d9ddfcdfc4f9ee 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -797,7 +797,10 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) if (CAFFE2_LINK_LOCAL_PROTOBUF) set(ONNX_PROTO_POST_BUILD_SCRIPT ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake) endif() + # Add op schemas in "ai.onnx.pytorch" domain + add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../caffe2/onnx/torch_ops") add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/onnx) + include_directories(${ONNX_INCLUDE_DIRS}) add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) # In mobile build we care about code size, and so we need drop diff --git a/docker/caffe2/jenkins/common/install_clang.sh b/docker/caffe2/jenkins/common/install_clang.sh index 694606ec0b91f3..fbf5515bae36d5 100755 --- a/docker/caffe2/jenkins/common/install_clang.sh +++ b/docker/caffe2/jenkins/common/install_clang.sh @@ -4,6 +4,13 @@ set -ex [ -n "$CLANG_VERSION" ] +if [[ "$CLANG_VERSION" == "7" ]]; then + apt-get update + apt-get install -y --no-install-recommends software-properties-common wget + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - + apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-7 main" +fi + apt-get update apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/docs/cpp/source/Doxyfile b/docs/cpp/source/Doxyfile index 2ab4947453eaca..b600edee6c2498 100644 --- a/docs/cpp/source/Doxyfile +++ b/docs/cpp/source/Doxyfile @@ -66,6 +66,8 @@ CREATE_SUBDIRS = NO FULL_PATH_NAMES = YES # Nested folders will be ignored without this. RECURSIVE = YES +# Blacklist certain file patterns from the INPUT section. +EXCLUDE = ../../../torch/csrc/api/include/torch/nn/pimpl-inl.h ################################################################################ # Output formats for Doxygen to create. # ################################################################################ @@ -102,7 +104,7 @@ EXTRACT_ALL = YES EXTRACT_PACKAGE = YES EXTRACT_STATIC = YES CASE_SENSE_NAMES = NO -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* +EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* std::* ################################################################################ # Docstring control / customization. # ################################################################################ diff --git a/docs/cpp/source/building.rst b/docs/cpp/source/building.rst deleted file mode 100644 index 24ab7a5e69ba3f..00000000000000 --- a/docs/cpp/source/building.rst +++ /dev/null @@ -1,2 +0,0 @@ -Building -======== diff --git a/docs/cpp/source/contributing.rst b/docs/cpp/source/contributing.rst index 5a1988f1db7c62..14ae9224d734ea 100644 --- a/docs/cpp/source/contributing.rst +++ b/docs/cpp/source/contributing.rst @@ -1,2 +1,8 @@ -Contributing -============ +Contributing to PyTorch +======================= + +If you would like to contribute to the PyTorch C++ API, refer to the +`CONTRIBUTING.md +`_ document in +the PyTorch repository. It contains instructions on how to develop PyTorch from source +and submit a proposal for your patch or feature. We will be happy to review it! 
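Not part of the patch: the `DeviceToOption()` and `OptionToDevice()` helpers added to `caffe2/proto/caffe2_pb.h` above are not exercised anywhere in this change, so here is a minimal, hedged sketch of how they are expected to round-trip between `at::Device` and `caffe2::DeviceOption`, assuming the patch is applied and the usual caffe2/ATen headers are available. The `main()` wrapper and the chosen device index are illustrative only.

```cpp
// Usage sketch (assumption: the caffe2_pb.h changes above are applied).
#include <iostream>
#include "caffe2/proto/caffe2_pb.h"

int main() {
  // Describe "CUDA device 1" the ATen way...
  at::Device device(at::DeviceType::CUDA, /*index=*/1);

  // ...convert it to a caffe2 DeviceOption proto...
  caffe2::DeviceOption option = caffe2::DeviceToOption(device);
  std::cout << option.device_type() << " "    // equals caffe2::PROTO_CUDA
            << option.cuda_gpu_id() << "\n";  // 1

  // ...and round-trip it back to an at::Device.
  at::Device roundtrip = caffe2::OptionToDevice(option);
  std::cout << (roundtrip.type() == at::DeviceType::CUDA) << " "
            << static_cast<int>(roundtrip.index()) << "\n";  // 1 1
  return 0;
}
```

This is the same conversion the new `at::Device`-taking context constructors in this patch rely on, e.g. `MKLContext(const at::Device& device) : MKLContext(DeviceToOption(device)) {}`.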
diff --git a/docs/cpp/source/examples.rst b/docs/cpp/source/examples.rst deleted file mode 100644 index bac945d559fec7..00000000000000 --- a/docs/cpp/source/examples.rst +++ /dev/null @@ -1,2 +0,0 @@ -Examples -======== diff --git a/docs/cpp/source/frontend.rst b/docs/cpp/source/frontend.rst new file mode 100644 index 00000000000000..0a9a9943c6cbcd --- /dev/null +++ b/docs/cpp/source/frontend.rst @@ -0,0 +1,146 @@ +The PyTorch C++ Frontend +======================== + +The PyTorch C++ frontend is a C++11 library for CPU and GPU +tensor computation, with automatic differentation and high level building +blocks for state of the art machine learning applications. + +Description +----------- + +The PyTorch C++ frontend can be thought of as a C++ version of the +PyTorch Python frontend, providing automatic differentiation and various higher +level abstractions for machine learning and neural networks. Specifically, +it consists of the following components: + ++----------------------+------------------------------------------------------------------------+ +| Component | Description | ++======================+========================================================================+ +| ``torch::Tensor`` | Automatically differentiable, efficient CPU and GPU enabled tensors | ++----------------------+------------------------------------------------------------------------+ +| ``torch::nn`` | A collection of composable modules for neural network modeling | ++----------------------+------------------------------------------------------------------------+ +| ``torch::optim`` | Optimization algorithms like SGD, Adam or RMSprop to train your models | ++----------------------+------------------------------------------------------------------------+ +| ``torch::data`` | Datasets, data pipelines and multi-threaded, asynchronous data loader | ++----------------------+------------------------------------------------------------------------+ +| ``torch::serialize`` | A serialization API for storing and loading model checkpoints | ++----------------------+------------------------------------------------------------------------+ +| ``torch::python`` | Glue to bind your C++ models into Python | ++----------------------+------------------------------------------------------------------------+ +| ``torch::jit`` | Pure C++ access to the TorchScript JIT compiler | ++----------------------+------------------------------------------------------------------------+ + +End-to-end example +------------------ + +Here is a simple, end-to-end example of defining and training a simple +neural network on the MNIST dataset: + +.. code-block:: cpp + + #include + + // Define a new Module. + struct Net : torch::nn::Module { + Net() { + // Construct and register two Linear submodules. + fc1 = register_module("fc1", torch::nn::Linear(8, 64)); + fc2 = register_module("fc2", torch::nn::Linear(64, 1)); + } + + // Implement the Net's algorithm. + torch::Tensor forward(torch::Tensor x) { + // Use one of many tensor manipulation functions. + x = torch::relu(fc1->forward(x)); + x = torch::dropout(x, /*p=*/0.5); + x = torch::sigmoid(fc2->forward(x)); + return x; + } + + // Use one of many "standard library" modules. + torch::nn::Linear fc1{nullptr}, fc2{nullptr}; + }; + + // Create a new Net. + Net net; + + // Create a multi-threaded data loader for the MNIST dataset. + auto data_loader = + torch::data::data_loader(torch::data::datasets::MNIST("./data")); + + // Instantiate an SGD optimization algorithm to update our Net's parameters. 
+ torch::optim::SGD optimizer(net.parameters(), /*lr=*/0.1); + + for (size_t epoch = 1; epoch <= 10; ++epoch) { + size_t batch_index = 0; + // Iterate the data loader to yield batches from the dataset. + for (auto batch : data_loader) { + // Reset gradients. + optimizer.zero_grad(); + // Execute the model on the input data. + auto prediction = model.forward(batch.data); + // Compute a loss value to judge the prediction of our model. + auto loss = torch::binary_cross_entropy(prediction, batch.label); + // Compute gradients of the loss w.r.t. the parameters of our model. + loss.backward(); + // Update the parameters based on the calculated gradients. + optimizer.step(); + + if (batch_index++ % 10 == 0) { + std::cout << "Epoch: " << epoch << " | Batch: " << batch_index + << " | Loss: " << loss << std::endl; + // Serialize your model periodically as a checkpoint. + torch::save(net, "net.pt"); + } + } + +To see more complete examples of using the PyTorch C++ frontend, see `the example repository +`_. + +Philosophy +---------- + +PyTorch's C++ frontend was designed with the idea that the Python frontend is +great, and should be used when possible; but in some settings, performance and +portability requirements make the use of the Python interpreter infeasible. For +example, Python is a poor choice for low latency, high performance or +multithreaded environments, such as video games or production servers. The +goal of the C++ frontend is to address these use cases, while not sacrificing +the user experience of the Python frontend. + +As such, the C++ frontend has been written with a few philosophical goals in mind: + +* **Closely model the Python frontend in its design**, naming, conventions and + functionality. While there may be occasional differences between the two + frontends (e.g., where we have dropped deprecated features or fixed "warts" + in the Python frontend), we guarantee that the effort in porting a Python model + to C++ should lie exclusively in **translating language features**, + not modifying functionality or behavior. + +* **Prioritize flexibility and user-friendliness over micro-optimization.** + In C++, you can often get optimal code, but at the cost of an extremely + unfriendly user experience. Flexibility and dynamism is at the heart of + PyTorch, and the C++ frontend seeks to preserve this experience, in some + cases sacrificing performance (or "hiding" performance knobs) to keep APIs + simple and explicable. We want researchers who don't write C++ for a living + to be able to use our APIs. + +A word of warning: Python is not necessarily slower than +C++! The Python frontend calls into C++ for almost anything computationally expensive +(especially any kind of numeric operation), and these operations will take up +the bulk of time spent in a program. If you would prefer to write Python, +and can afford to write Python, we recommend using the Python interface to +PyTorch. However, if you would prefer to write C++, or need to write C++ +(because of multithreading, latency or deployment requirements), the +C++ frontend to PyTorch provides an API that is approximately as convenient, +flexible, friendly and intuitive as its Python counterpart. The two frontends +serve different use cases, work hand in hand, and neither is meant to +unconditionally replace the other. 
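The end-to-end snippet above was written against a pre-release API: the ``torch::data::data_loader(...)`` call predates the data-loading API that eventually shipped, and the loop refers to a ``model`` variable although the object is named ``net``. As a cross-check, here is a minimal, self-contained sketch of the same define-and-train flow against the stable LibTorch C++ frontend, using synthetic tensors in place of the MNIST data loader so it compiles on its own; the shapes, learning rate and epoch count are illustrative only.

.. code-block:: cpp

    #include <torch/torch.h>
    #include <iostream>

    // Same two-layer network as in the example above.
    struct Net : torch::nn::Module {
      Net() {
        fc1 = register_module("fc1", torch::nn::Linear(8, 64));
        fc2 = register_module("fc2", torch::nn::Linear(64, 1));
      }

      torch::Tensor forward(torch::Tensor x) {
        x = torch::relu(fc1->forward(x));
        // The released torch::dropout takes an explicit `train` flag.
        x = torch::dropout(x, /*p=*/0.5, /*train=*/is_training());
        return torch::sigmoid(fc2->forward(x));
      }

      torch::nn::Linear fc1{nullptr}, fc2{nullptr};
    };

    int main() {
      Net net;
      torch::optim::SGD optimizer(net.parameters(), /*lr=*/0.1);

      // Synthetic stand-in for a real dataset / data loader.
      auto inputs = torch::randn({32, 8});
      auto targets = torch::rand({32, 1});

      for (int epoch = 1; epoch <= 10; ++epoch) {
        optimizer.zero_grad();
        auto prediction = net.forward(inputs);
        auto loss = torch::binary_cross_entropy(prediction, targets);
        loss.backward();
        optimizer.step();
        std::cout << "Epoch: " << epoch
                  << " | Loss: " << loss.item<float>() << std::endl;
      }
    }

The structure mirrors the Python frontend: modules register their parameters, the optimizer holds references to them, ``backward()`` populates the gradients, and ``optimizer.step()`` consumes them.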
+ +Installation +------------ + +Instructions on how to install the C++ frontend library distribution, including +an example for how to build a minimal application depending on LibTorch, may be +found by following `this `_ link. diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst index 2743c3ea650b4e..5fef739c975518 100644 --- a/docs/cpp/source/index.rst +++ b/docs/cpp/source/index.rst @@ -1,36 +1,168 @@ PyTorch C++ API =============== -The PyTorch C++ API is a research and production ready C++ interface to PyTorch, -a library for tensors and dynamic neural networks with strong GPU acceleration. +These pages provide documentation for the public portions of the PyTorch C++ +API. This API can roughly be divided into five parts: -Description +- **ATen**: The foundational tensor and mathematical operation library on which all else is built; +- **Autograd**: Augments ATen with automatic differentiation; +- **C++ Frontend**: High level constructs for training and evaluation of machine learning models; +- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter; +- **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines. + +Together, these building blocks form a research and +production ready C++ library for tensor computation and dynamic neural +networks with strong emphasis on GPU acceleration as well as fast CPU +performance. It is currently in use at Facebook in research and +production; we look forward to welcoming more users of the PyTorch C++ API. + +.. warning:: + + At the moment, the C++ API should be considered "beta" stability; we may + make major breaking changes to the backend in order to improve the API, + or in service of providing the Python interface to PyTorch, which is our + most stable and best supported interface. + +ATen +---- + +ATen is fundamentally a tensor library, on top of which almost all other Python +and C++ interfaces in PyTorch are built. It provides a core ``Tensor`` class, +on which many hundreds of operations are defined. Most of these operations have +both CPU and GPU implementations, to which the ``Tensor`` class will +dynamically dispatch based on its type. A small example of using ATen could +look as follows: + +.. code-block:: cpp + + #include + + at::Tensor a = at::ones({2, 2}, at::kInt); + at::Tensor b = at::randn({2, 2}); + auto c = a + b.to(at::kInt); + +This ``Tensor`` class and all other symbols in ATen are found in the `at::` +namespace, documented +`here `_. + +Autograd +-------- + +What we term *autograd* are the portions of PyTorch's C++ API that augment the +ATen ``Tensor`` class with capabilities concerning automatic differentiation. +The autograd system records operations on tensors to form an *autograd graph*. +Calling ``backwards()`` on a leaf variable in this graph performs reverse mode +differentiation through the network of functions and tensors spanning the +autograd graph, ultimately yieldings gradients. The following example provides +a taste of this interface: + +.. code-block:: cpp + + #include + #include + + at::Tensor a = torch::ones({2, 2}, at::requires_grad()); + at::Tensor b = torch::randn({2, 2}); + auto c = a + b; + c.backward(); // a.grad() will now hold the gradient of c w.r.t. a. + +The ``at::Tensor`` class in ATen is not differentiable by default. To add the +differentiability of tensors the autograd API provides, you must use tensor +factory functions from the `torch::` namespace instead of the `at` namespace. 
+For example, while a tensor created with `at::ones` will not be differentiable, +a tensor created with `torch::ones` will be. + +C++ Frontend +------------ + +The PyTorch C++ frontend provides a high level, pure C++ modeling interface for +neural network and general machine learning research and production use cases, +largely following the Python API in design and provided functionality. The C++ +frontend includes the following: + +- An interface for defining machine learning models through a hierarchical module system (like ``torch.nn.Module``); +- A "standard library" of pre-existing modules for the most common modeling purposes (e.g. convolutions, RNNs, batch normalization etc.); +- An optimization API, including implementations of popular optimizers such as SGD, Adam, RMSprop and others; +- A means of representing datasets and data pipelines, including functionality to load data in parallel over many CPU cores; +- A serialization format for storing and loading checkpoints of a training session (like ``torch.utils.data.DataLoader``); +- Automatic parallelization of models onto multiple GPUs (like ``torch.nn.parallel.DataParallel``); +- Support code to easily bind C++ models into Python using pybind11; +- Entry points to the TorchScript JIT compiler; +- Helpful utilities to facilitate interfacing with the ATen and Autograd APIs. + +See `this `_ document for a more +detailed description of the C++ frontend. Relevant sections of the `torch::` +namespace related to the C++ Frontend include `torch::nn +`_, +`torch::optim +`_, +`torch::data +`_, +`torch::serialize +`_, +`torch::jit +`_ +and `torch::python +`_. +Examples of the C++ frontend can be found in `this repository +`_ which is being +expanded on a continuous and active basis. + +.. note:: + + Unless you have a particular reason to constrain yourself exclusively to ATen + or the Autograd API, the C++ frontend is the recommended entry point to the + PyTorch C++ ecosystem. While it is still in beta as we collect user feedback + (from you!), it provides both more functionality and better stability + guarantees than the ATen and Autograd APIs. + +TorchScript ----------- -The PyTorch C++ API provides all the major building blocks to research and iterate on -state of the art machine learning models with a user friendly modern C++ interface, -as well as providing an excellent platform for deploying machine learning applications -in bare bones, high performance environments. +TorchScript a representation of a PyTorch model that can be understood, +compiled and serialized by the TorchScript compiler. Fundamentally, TorchScript +is a programming language in its own right. It is a subset of Python using +the PyTorch API. The C++ interface to TorchScript encompasses three primary pieces of +functionality: -1. Design Philosophy -2. Description of components -3. One small example +- A mechanism for loading and executing serialized TorchScript models defined in Python; +- An API for defining custom operators that extend the TorchScript standard library of operations; +- Just-in-time compilation of TorchScript programs from C++. -License -------- +The first mechanism may be of great interest to you if you would like to define +your models in Python as much as possible, but subsequently export them to C++ +for production environments and no-Python inference. You can find out more +about this by following `this +`_ link. 
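To make the first mechanism concrete, here is a minimal sketch of loading and running a serialized TorchScript model from C++. It assumes a recent LibTorch in which ``torch::jit::load`` returns a ``torch::jit::Module`` by value; the file name ``model.pt`` and the input shape are placeholders.

.. code-block:: cpp

    #include <torch/script.h>
    #include <iostream>
    #include <vector>

    int main() {
      // Load a module previously exported from Python, e.g. with
      // torch.jit.trace(model, example).save("model.pt").
      torch::jit::Module module = torch::jit::load("model.pt");

      // Pack the arguments and invoke the module's forward method.
      std::vector<torch::jit::IValue> inputs;
      inputs.push_back(torch::ones({1, 3, 224, 224}));
      at::Tensor output = module.forward(inputs).toTensor();

      std::cout << output.sizes() << std::endl;
    }

No Python interpreter is involved at any point, which is what makes this path suitable for no-Python production inference.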
The second +API concerns itself with scenarios in which you would like to extend +TorchScript with custom operators, which can similarly be serialized and +invoked from C++ during inference. Lastly, the `torch::jit::compile +`_ +function may be used to access the TorchScript compiler directly from C++. +C++ Extensions +-------------- +*C++ Extensions* offer a simple yet powerful way of accessing all of the above +interfaces for the purpose of extending regular Python use-cases of PyTorch. +C++ extensions are most commonly used to implement custom operators in C++ or +CUDA to accelerate research in vanilla PyTorch setups. The C++ extension API +does not add any new functionality to the PyTorch C++ API. Instead, it +provides integration with Python setuptools as well as JIT compilation +mechanisms that allow access to ATen, the autograd and other C++ APIs from +Python. To learn more about the C++ extension API, see +`this `_ tutorial. Contents -======== +-------- .. toctree:: :maxdepth: 2 - api/library_root - examples - building + frontend + installing contributing + api/library_root Indices and tables diff --git a/docs/cpp/source/installing.rst b/docs/cpp/source/installing.rst new file mode 100644 index 00000000000000..24906dbb53391a --- /dev/null +++ b/docs/cpp/source/installing.rst @@ -0,0 +1,131 @@ +Installing C++ Distributions of PyTorch +======================================= + +We provide binary distributions of all headers, libraries and CMake +configuration files required to depend on PyTorch. We call this distribution +*LibTorch*, and you can download ZIP archives containing the latest LibTorch +distribution on `our website `_. Below +is a small example of writing a minimal application that depends on LibTorch +and uses the `at::Tensor` class which comes with the PyTorch C++ API. + +Minimal Example +--------------- + +The first step is to download the LibTorch ZIP archive via the link above. For +example: + +.. code-block:: sh + + wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip + unzip libtorch-shared-with-deps-latest.zip + + +Next, we can write a minimal CMake build configuration to develop a small +application that depends on LibTorch. CMake is not a hard requirement for using +LibTorch, but it is the recommended and blessed build system and will be well +supported into the future. A most basic `CMakeLists.txt` file could look like +this: + +.. code-block:: cmake + + cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + project(example-app) + + find_package(Torch REQUIRED) + + add_executable(example-app example-app.cpp) + target_link_libraries(example-app "${TORCH_LIBRARIES}") + set_property(TARGET example-app PROPERTY CXX_STANDARD 11) + +The implementation of our example will simply create a new `at::Tensor` and +print it: + +.. code-block:: cpp + + #include + #include + + int main() { + at::Tensor tensor = torch::rand({2, 3}); + std::cout << tensor << std::endl; + } + +While there are more fine-grained headers you can include to access only parts +of the PyTorch C++ API, including `torch/torch.h` is the most sure-proof way of +including most of its functionality. + +The last step is to build the application. For this, assume our example +directory is laid out like this: + +.. code-block:: sh + + example-app/ + CMakeLists.txt + example-app.cpp + +We can now run the following commands to build the application from within the +``example-app/`` folder: + +.. 
code-block:: sh + + mkdir build + cd build + cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + make + +where ``/path/to/libtorch`` should be the full path to the unzipped LibTorch +distribution. If all goes well, it will look something like this: + +.. code-block:: sh + + root@4b5a67132e81:/example-app# mkdir build + root@4b5a67132e81:/example-app# cd build + root@4b5a67132e81:/example-app/build# cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch .. + -- The C compiler identification is GNU 5.4.0 + -- The CXX compiler identification is GNU 5.4.0 + -- Check for working C compiler: /usr/bin/cc + -- Check for working C compiler: /usr/bin/cc -- works + -- Detecting C compiler ABI info + -- Detecting C compiler ABI info - done + -- Detecting C compile features + -- Detecting C compile features - done + -- Check for working CXX compiler: /usr/bin/c++ + -- Check for working CXX compiler: /usr/bin/c++ -- works + -- Detecting CXX compiler ABI info + -- Detecting CXX compiler ABI info - done + -- Detecting CXX compile features + -- Detecting CXX compile features - done + -- Looking for pthread.h + -- Looking for pthread.h - found + -- Looking for pthread_create + -- Looking for pthread_create - not found + -- Looking for pthread_create in pthreads + -- Looking for pthread_create in pthreads - not found + -- Looking for pthread_create in pthread + -- Looking for pthread_create in pthread - found + -- Found Threads: TRUE + -- Configuring done + -- Generating done + -- Build files have been written to: /example-app/build + root@4b5a67132e81:/example-app/build# make + Scanning dependencies of target example-app + [ 50%] Building CXX object CMakeFiles/example-app.dir/example-app.cpp.o + [100%] Linking CXX executable example-app + [100%] Built target example-app + +Executing the resulting ``example-app`` binary found in the ``build`` folder +should now merrily print the tensor (exact output subject to randomness): + +.. code-block:: sh + + root@4b5a67132e81:/example-app/build# ./example-app model.pt + 0.2063 0.6593 0.0866 + 0.0796 0.5841 0.1569 + [ Variable[CPUFloatType]{2,3} ] + +Support +------- + +If you run into any troubles with this installation and minimal usage guide, +please use our `forum `_ or `GitHub issues +`_ to get in touch. diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 68420d837bf801..a0d3abfa5501ad 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -1173,6 +1173,11 @@ Distance functions .. autofunction:: cosine_similarity +:hidden:`pdist` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pdist + Loss functions -------------- diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 63047bb11fddff..212f68e694d7f9 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -74,9 +74,10 @@ You can force synchronous computation by setting environment variable operation is actually executed, so the stack trace does not show where it was requested.) -As an exception, several functions such as :meth:`~torch.Tensor.copy_` admit -an explicit :attr:`async` argument, which lets the caller bypass synchronization -when it is unnecessary. Another exception is CUDA streams, explained below. +As an exception, several functions such as :meth:`~torch.Tensor.to` and +:meth:`~torch.Tensor.copy_` admit an explicit :attr:`non_blocking` argument, +which lets the caller bypass synchronization when it is unnecessary. +Another exception is CUDA streams, explained below. 
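The same ``non_blocking`` flag is exposed in the C++ API through ``Tensor::copy_`` and ``Tensor::to``. Below is a minimal C++ sketch, assuming a recent LibTorch built with CUDA; the tensor shapes and the final ``torch::cuda::synchronize()`` call are only illustrative.

.. code-block:: cpp

    #include <torch/torch.h>

    int main() {
      if (!torch::cuda::is_available()) {
        return 0;
      }

      // Page-locked (pinned) host memory is what allows the copy below to
      // overlap with host-side work instead of blocking on it.
      torch::Tensor host = torch::randn({1024, 1024}).pin_memory();
      torch::Tensor gpu = torch::empty({1024, 1024}, torch::device(torch::kCUDA));

      // The copy is queued on the current CUDA stream and returns immediately.
      gpu.copy_(host, /*non_blocking=*/true);

      // ... unrelated host work could run here while the transfer is in flight ...

      // Synchronize before depending on the transferred data from the host.
      torch::cuda::synchronize();
    }

Without pinned host memory the call still succeeds, but the runtime may fall back to a copy that synchronizes with the host.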
CUDA streams ^^^^^^^^^^^^ diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index a329bb049baac3..71dcaa7511fa26 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -110,6 +110,7 @@ An empty sparse tensor can be constructed by specifying its size: .. method:: mm .. method:: mul .. method:: mul_ + .. method:: narrow_copy .. method:: resizeAs_ .. method:: size .. method:: spadd diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 31585d4a969770..1d55fa8b937738 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -223,6 +223,7 @@ Reduction Ops Comparison Ops ~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: allclose +.. autofunction:: argsort .. autofunction:: eq .. autofunction:: equal .. autofunction:: ge @@ -256,6 +257,7 @@ Spectral Ops Other Operations ~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: bincount +.. autofunction:: broadcast_tensors .. autofunction:: cross .. autofunction:: diag .. autofunction:: diagflat diff --git a/setup.py b/setup.py index 94455ed1cf7be7..a8cdac91e92369 100644 --- a/setup.py +++ b/setup.py @@ -1202,6 +1202,8 @@ def make_relative_rpath(path): 'lib/include/caffe2/utils/*.h', 'lib/include/c10/*.h', 'lib/include/c10/macros/*.h', + 'lib/include/c10/util/*.h', + 'lib/include/caffe2/core/*.h', 'lib/include/torch/*.h', 'lib/include/torch/csrc/*.h', 'lib/include/torch/csrc/api/include/torch/*.h', diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index f692bdfae123b9..059b004a84840e 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -5,7 +5,6 @@ project(custom_ops) find_package(Torch REQUIRED) add_library(custom_ops SHARED op.cpp) -target_compile_features(custom_ops PUBLIC cxx_range_for) target_link_libraries(custom_ops ${TORCH_LIBRARIES}) add_executable(test_custom_ops test_custom_ops.cpp) diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect index bcbcffaee486a3..8932957402c94e 100644 --- a/test/expect/TestBatched.test_for.expect +++ b/test/expect/TestBatched.test_for.expect @@ -6,17 +6,17 @@ graph(%x.1_data : Dynamic %y_dims : Dynamic) { %6 : int = prim::Constant[value=10]() %7 : int = prim::Constant[value=1]() - %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) + %x : Dynamic, %9 : Dynamic, %10 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { - %13 : int = prim::Constant[value=1]() - %14 : Long() = prim::NumToTensor(%13) - %alpha : float = prim::TensorToNum(%14) + %15 : int = prim::Constant[value=1]() + %16 : Long() = prim::NumToTensor(%15) + %alpha : float = prim::TensorToNum(%16) %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) %mask : Dynamic = aten::mul(%5_mask, %y_mask) %dims : Dynamic = aten::__or__(%5_dims, %y_dims) - %19 : int = prim::Constant[value=1]() + %21 : int = prim::Constant[value=1]() %data : Dynamic = aten::where(%mask, %data.1, %5_data) - -> (%19, %data, %mask, %dims) + -> (%21, %data, %mask, %dims) } - return (%x, %21, %22); + return (%x, %9, %10); } diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect index 66e3cdb6a2dfa8..7aba7a89ace320 100644 --- a/test/expect/TestBatched.test_while.expect +++ b/test/expect/TestBatched.test_while.expect @@ -14,34 +14,34 @@ graph(%a.1_data : Dynamic %13 : Dynamic = aten::sum(%12) %14 : Dynamic = aten::gt(%13, %11) %15 : int = prim::TensorToNum(%14) - %63 : 
Dynamic, %64 : Dynamic, %65 : Dynamic, %a : Dynamic, %61 : Dynamic, %62 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) + %16 : Dynamic, %17 : Dynamic, %18 : Dynamic, %a : Dynamic, %20 : Dynamic, %21 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { - %24 : int = prim::Constant[value=1]() - %25 : Long() = prim::NumToTensor(%24) - %alpha : float = prim::TensorToNum(%25) + %29 : int = prim::Constant[value=1]() + %30 : Long() = prim::NumToTensor(%29) + %alpha : float = prim::TensorToNum(%30) %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) %mask : Dynamic = aten::mul(%6_mask, %b_mask) %dims : Dynamic = aten::__or__(%6_dims, %b_dims) - %30 : Dynamic = aten::gt(%data.1, %b_data) - %31 : Dynamic = aten::mul(%mask, %b_mask) - %32 : Dynamic = aten::__or__(%dims, %b_dims) - %33 : int = prim::TensorToNum(%30) - %34 : int = prim::Constant[value=1]() - %35 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) - %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %35) - %37 : int = aten::dim(%cond_mask.1) - %38 : int = aten::eq(%37, %34) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) + %35 : Dynamic = aten::gt(%data.1, %b_data) + %36 : Dynamic = aten::mul(%mask, %b_mask) + %37 : Dynamic = aten::__or__(%dims, %b_dims) + %38 : int = prim::TensorToNum(%35) + %39 : int = prim::Constant[value=1]() + %40 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) + %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %40) + %42 : int = aten::dim(%cond_mask.1) + %43 : int = aten::eq(%42, %39) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%43) block0() { - %42 : int = aten::dim(%data.1) - %43 : int = aten::sub(%42, %34) - %44 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%43, %44, %cond_mask.1) - block0(%_ : int, %47 : Dynamic) { - %48 : int = aten::dim(%47) - %data.2 : Dynamic = aten::unsqueeze(%47, %48) - %50 : int = prim::Constant[value=1]() - -> (%50, %data.2) + %47 : int = aten::dim(%data.1) + %48 : int = aten::sub(%47, %39) + %49 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%48, %49, %cond_mask.1) + block0(%_ : int, %52 : Dynamic) { + %53 : int = aten::dim(%52) + %data.2 : Dynamic = aten::unsqueeze(%52, %53) + %55 : int = prim::Constant[value=1]() + -> (%55, %data.2) } %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) @@ -53,12 +53,12 @@ graph(%a.1_data : Dynamic %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) %res_dims : Dynamic = aten::__or__(%dims, %6_dims) - %56 : int = prim::Constant[value=0]() - %57 : Dynamic = aten::mul(%30, %31) - %58 : Dynamic = aten::sum(%57) - %59 : Dynamic = aten::gt(%58, %56) - %60 : int = prim::TensorToNum(%59) - -> (%60, %30, %31, %32, %res_data, %res_mask, %res_dims) + %61 : int = prim::Constant[value=0]() + %62 : Dynamic = aten::mul(%35, %36) + %63 : Dynamic = aten::sum(%62) + %64 : Dynamic = aten::gt(%63, %61) + %65 : int = prim::TensorToNum(%64) + -> (%65, %35, %36, %37, %res_data, %res_mask, %res_dims) } - return (%a, %61, %62); + return (%a, %20, %21); } diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect index 5bc86daf4765c7..6f72acc4c8483e 100644 --- 
a/test/expect/TestJit.test_constant_prop_print.expect +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -2,6 +2,7 @@ graph(%input_tensor : Dynamic) { %1 : int = prim::Constant[value=6]() = prim::Print(%1) %2 : int = prim::Constant[value=8]() - %3 : Dynamic = aten::add(%2, %input_tensor) - return (%3); + %3 : int = prim::Constant[value=1]() + %4 : Dynamic = aten::add(%input_tensor, %2, %3) + return (%4); } diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect index 029f9ac05a0783..71cf099a54a663 100644 --- a/test/expect/TestJit.test_constant_prop_simple.expect +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -1,5 +1,6 @@ graph(%input_tensor : Dynamic) { %1 : int = prim::Constant[value=8]() - %2 : Dynamic = aten::add(%1, %input_tensor) - return (%2); + %2 : int = prim::Constant[value=1]() + %3 : Dynamic = aten::add(%input_tensor, %1, %2) + return (%3); } diff --git a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect index bed05b89580c6c..b2159144f798fc 100644 --- a/test/expect/TestScript.test_call_script_fn_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_fn_from_script_fn.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic) { - %2 : int = prim::Constant[value=1]() - %1 : Dynamic = aten::neg(%x) + %1 : int = prim::Constant[value=1]() + %2 : Dynamic = aten::neg(%x) %3 : int = prim::Constant[value=1]() - %4 : Dynamic = aten::add(%1, %2, %3) + %4 : Dynamic = aten::add(%2, %1, %3) return (%4); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect index b7492626b5d8fe..3478376f829c85 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_fn.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_fn.expect @@ -1,14 +1,14 @@ graph(%x : Dynamic) { - %9 : int = prim::Constant[value=1]() - %1 : int = prim::Constant[value=3]() - %2 : int = prim::Constant[value=4]() - %3 : int[] = prim::ListConstruct(%2, %1) - %4 : int = prim::Constant[value=6]() - %5 : int = prim::Constant[value=0]() - %6 : int[] = prim::Constant[value=[0, -1]]() - %7 : Dynamic = aten::zeros(%3, %4, %5, %6) - %8 : Dynamic = aten::mm(%x, %7) + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=3]() + %3 : int = prim::Constant[value=4]() + %4 : int[] = prim::ListConstruct(%3, %2) + %5 : int = prim::Constant[value=6]() + %6 : int = prim::Constant[value=0]() + %7 : int[] = prim::Constant[value=[0, -1]]() + %8 : Dynamic = aten::zeros(%4, %5, %6, %7) + %9 : Dynamic = aten::mm(%x, %8) %10 : int = prim::Constant[value=1]() - %11 : Dynamic = aten::add(%8, %9, %10) + %11 : Dynamic = aten::add(%9, %1, %10) return (%11); } diff --git a/test/expect/TestScript.test_call_script_mod_from_script_module.expect b/test/expect/TestScript.test_call_script_mod_from_script_module.expect index 5cae9dcdf96e9d..0365ff600b0a20 100644 --- a/test/expect/TestScript.test_call_script_mod_from_script_module.expect +++ b/test/expect/TestScript.test_call_script_mod_from_script_module.expect @@ -1,7 +1,7 @@ graph(%x : Dynamic %1 : Dynamic - %3 : Dynamic) { - %2 : Dynamic = aten::mm(%x, %1) - %4 : Dynamic = aten::mm(%2, %3) + %2 : Dynamic) { + %3 : Dynamic = aten::mm(%x, %1) + %4 : Dynamic = aten::mm(%3, %2) return (%4); } diff --git a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect 
b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect index cbdbc744b5e85d..3674a3fbc07d2b 100644 --- a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect @@ -56,8 +56,8 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %23 : Float(*, *) = aten::neg(%3) %24 : int = prim::Constant[value=1]() %25 : Float(*, *) = aten::add(%23, %24, %24) - %26 : Float(*, *) = aten::mul(%19, %3) - %27 : Float(*, *) = aten::mul(%26, %25) + %26 : Float(*, *) = aten::mul(%25, %3) + %27 : Float(*, *) = aten::mul(%26, %19) %28 : Float(*, *) = aten::mul(%2, %2) %29 : Float(*, *) = aten::neg(%28) %30 : int = prim::Constant[value=1]() @@ -66,13 +66,13 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %33 : Float(*, *) = aten::neg(%1) %34 : int = prim::Constant[value=1]() %35 : Float(*, *) = aten::add(%33, %34, %34) - %36 : Float(*, *) = aten::mul(%22, %1) - %37 : Float(*, *) = aten::mul(%36, %35) + %36 : Float(*, *) = aten::mul(%35, %1) + %37 : Float(*, *) = aten::mul(%36, %22) %38 : Float(*, *) = aten::neg(%0) %39 : int = prim::Constant[value=1]() %40 : Float(*, *) = aten::add(%38, %39, %39) - %41 : Float(*, *) = aten::mul(%20, %0) - %42 : Float(*, *) = aten::mul(%41, %40) + %41 : Float(*, *) = aten::mul(%40, %0) + %42 : Float(*, *) = aten::mul(%41, %20) %43 : Float(*, *) = prim::FusedConcat[dim=1](%42, %37, %32, %27) return (%43, %18); } diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index b0dc85644751d8..fb14a35296623a 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -62,8 +62,8 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %20 : Float(*, *) = aten::neg(%3) %21 : int = prim::Constant[value=1]() %22 : Float(*, *) = aten::add(%20, %21, %21) - %23 : Float(*, *) = aten::mul(%8, %3) - %24 : Float(*, *) = aten::mul(%23, %22) + %23 : Float(*, *) = aten::mul(%22, %3) + %24 : Float(*, *) = aten::mul(%23, %8) %25 : Float(*, *) = aten::mul(%2, %2) %26 : Float(*, *) = aten::neg(%25) %27 : int = prim::Constant[value=1]() @@ -72,13 +72,13 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %30 : Float(*, *) = aten::neg(%1) %31 : int = prim::Constant[value=1]() %32 : Float(*, *) = aten::add(%30, %31, %31) - %33 : Float(*, *) = aten::mul(%19, %1) - %34 : Float(*, *) = aten::mul(%33, %32) + %33 : Float(*, *) = aten::mul(%32, %1) + %34 : Float(*, *) = aten::mul(%33, %19) %35 : Float(*, *) = aten::neg(%0) %36 : int = prim::Constant[value=1]() %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%17, %0) - %39 : Float(*, *) = aten::mul(%38, %37) + %38 : Float(*, *) = aten::mul(%37, %0) + %39 : Float(*, *) = aten::mul(%38, %17) %40 : Float(*, *) = prim::FusedConcat[dim=1](%39, %34, %29, %24) return (%40); } diff --git a/test/expect/TestScript.test_scalar_fusion.expect b/test/expect/TestScript.test_scalar_fusion.expect index e2fd92a0f5739c..565855f262d16c 100644 --- a/test/expect/TestScript.test_scalar_fusion.expect +++ b/test/expect/TestScript.test_scalar_fusion.expect @@ -6,7 +6,7 @@ graph(%x : Float() with prim::FusionGroup_0 = graph(%0 : Float() %1 : Float()) { %2 : int = prim::Constant[value=2]() - %3 : Float() = aten::mul(%2, %1) + %3 : Float() = aten::mul(%1, %2) %4 : int = prim::Constant[value=1]() %5 : Float() = aten::add(%3, %0, %4) return (%5); diff --git a/test/onnx/expect/TestOperators.test_full.expect 
b/test/onnx/expect/TestOperators.test_full.expect new file mode 100644 index 00000000000000..db975329ffa7b5 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -0,0 +1,148 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.4" +graph { + node { + output: "1" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: INT64 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "0" + output: "2" + op_type: "Shape" + } + node { + input: "2" + input: "1" + output: "3" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + output: "4" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: INT64 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "0" + output: "5" + op_type: "Shape" + } + node { + input: "5" + input: "4" + output: "6" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "3" + output: "7" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 0 + type: INTS + } + } + node { + input: "6" + output: "8" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 0 + type: INTS + } + } + node { + input: "7" + input: "8" + output: "9" + op_type: "Concat" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "9" + output: "10" + op_type: "ConstantFill" + attribute { + name: "dtype" + i: 1 + type: INT + } + attribute { + name: "input_as_shape" + i: 1 + type: INT + } + attribute { + name: "value" + f: 2 + type: FLOAT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "10" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 63f1f3cc563951..f11abcc7e6ef3b 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -3,8 +3,8 @@ producer_name: "pytorch" producer_version: "0.4" graph { node { - input: "1" input: "0" + input: "1" output: "2" op_type: "Less" } diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index fb36f3449f2664..b50002eacbaf3b 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -3,8 +3,8 @@ producer_name: "pytorch" producer_version: "0.4" graph { node { - input: "1" input: "0" + input: "1" output: "2" op_type: "Greater" } diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 2dfdd409a15ce7..1e2d0ffb294219 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -287,6 +287,10 @@ def test_hardtanh(self): x = Variable(torch.randn(3, 4), requires_grad=True) self.assertONNX(lambda x: torch.nn.Hardtanh(-0.5, 0.5)(x), x) + def test_full(self): + x = torch.randn(3, 4, requires_grad=True) + self.assertONNX(lambda x: torch.full(x.shape, 2), x) + def test_max(self): x = Variable(torch.randn(3, 4), requires_grad=True) y = Variable(torch.randn(3, 4), requires_grad=True) diff --git a/test/test_autograd.py b/test/test_autograd.py index f9ccfb6c958e99..0642e87399c676 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1406,6 +1406,7 @@ def test_unused_output(self): 
expected_grad[:2] = grad_output self.assertEqual(x.grad.data, expected_grad) + @skipIfRocm def test_ctc_loss(self): batch_size = 64 num_labels = 101 diff --git a/test/test_cuda.py b/test/test_cuda.py index cdf8d46ce236cf..2c647b08cbd601 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -268,11 +268,11 @@ def tmp(t): ('div', small_3d, lambda t: [number(3.14, 3, t)], '', types, False, "skipIfRocm:ByteTensor,CharTensor,FloatTensor,HalfTensor,ShortTensor"), ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types, False, "skipIfRocm:HalfTensor"), - ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types, False, "skipIfRocm:HalfTensor"), + ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), + ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'), + ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'), + ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'), + ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), # HalfTensor gives bad result at pow-2 with data sampled from torch.randn ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, False, "skipIfRocm:HalfTensor,FloatTensor"), diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 020486c1fbda35..3d9af20c859658 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -371,6 +371,7 @@ def test_segfault(self): finally: p.terminate() + @skipIfRocm def test_timeout(self): p = ErrorTrackingProcess(target=_test_timeout) p.start() diff --git a/test/test_distributions.py b/test/test_distributions.py index 5fbc2003be27e4..2c489d858c1238 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -542,6 +542,12 @@ def is_all_nan(tensor): 'scale': torch.tensor([1., -1.], requires_grad=True), }, ]), + Example(MultivariateNormal, [ + { + 'loc': torch.tensor([1., 1.], requires_grad=True), + 'covariance_matrix': torch.tensor([[1.0, 0.0], [0.0, -2.0]], requires_grad=True), + }, + ]), Example(Normal, [ { 'loc': torch.tensor([1., 1.], requires_grad=True), @@ -2372,12 +2378,20 @@ def test_valid_parameter_broadcasting(self): (1, 2)), (StudentT(df=torch.tensor([1.]), scale=torch.tensor([[1.]])), (1, 1)), + (StudentT(df=1., loc=torch.zeros(5, 1), scale=torch.ones(3)), + (5, 3)), ] for dist, expected_size in valid_examples: - dist_sample_size = dist.sample().size() - self.assertEqual(dist_sample_size, expected_size, - 'actual size: {} != expected size: {}'.format(dist_sample_size, expected_size)) + actual_size = dist.sample().size() + self.assertEqual(actual_size, expected_size, + '{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size)) + + sample_shape = torch.Size((2,)) + expected_size = sample_shape + expected_size + actual_size = dist.sample(sample_shape).size() + self.assertEqual(actual_size, expected_size, + '{} actual size: {} != expected size: {}'.format(dist, actual_size, expected_size)) def test_invalid_parameter_broadcasting(self): # invalid broadcasting cases; should throw error diff --git a/test/test_jit.py b/test/test_jit.py index a448362b470bbf..24d8076d31365f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -240,14 
+240,10 @@ def getExportImportCopy(self, m): imported = torch.jit.load(f.name) finally: os.unlink(f.name) - f = tempfile.NamedTemporaryFile(delete=False) - try: - f.close() - imported.save(f.name) - imported = torch.jit.load(f.name) - finally: - os.unlink(f.name) - return imported + buffer = io.BytesIO() + torch.jit.save(imported, buffer) + buffer.seek(0) + return torch.jit.load(buffer) def assertGraphContains(self, graph, kind): self.assertTrue(any(n.kind() == kind for n in graph.nodes())) @@ -2230,7 +2226,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_if_else_with_scalar(self): def single_if(a, b): @@ -2250,7 +2246,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_if_noelse(self): def single_if(a, b): @@ -2268,7 +2264,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_if_noelse_with_scalar(self): def single_if(a, b): @@ -2286,7 +2282,7 @@ def single_if(a, b): script_if = torch.jit.script(single_if) graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_while(self): def single_while(a, b): @@ -2305,7 +2301,7 @@ def single_while(a, b): script_while = torch.jit.script(single_while) graph = torch.to_batch_graph(script_while.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_for(self): def single_for(x, y): @@ -2323,7 +2319,7 @@ def single_for(x, y): script_for = torch.jit.script(single_for) graph = torch.to_batch_graph(script_for.graph) - self.assertExpected(str(graph)) + self.assertExpected(canonical(graph)) def test_lstm(self): def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): @@ -2645,18 +2641,23 @@ def stuff3(x): return torch.ones(x), x self.checkScript(stuff3, ([3, 2],)) - def test_nested_list_error(self): - with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): - @torch.jit.script - def foo(x): - # type: (Tuple[List[List[int]]]) -> int - return 4 + def test_nested_list(self): + def foo(z): + # type: (Tuple[int, List[List[int]]]) -> int + x, y = z + return y[0][1] + self.checkScript(foo, ((1, [[1, 2], [3, 4]]),)) + + def test_nested_list_construct(self): + def foo(): + return [[4]] + [[4, 5]] + self.checkScript(foo, ()) - def test_nested_list_construct_error(self): - with self.assertRaisesRegex(RuntimeError, "Lists can only contain"): + def test_generic_list_errors(self): + with self.assertRaisesRegex(RuntimeError, "previously matched to type"): @torch.jit.script def foo(x): - return [[4]] + return [[x]] + [[1]] def test_script_cu(self): cu = torch.jit.CompilationUnit(''' @@ -2723,18 +2724,23 @@ def func(a, b): @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") @skipIfRocm def test_clamp_fusion(self): - def func(a, b): + def func2(a, b): return torch.clamp(a + b, min=0, max=2) + def funcInf(a, b): + return torch.clamp(a + b, min=0, max=float('inf')) + a = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True) b = torch.randn(4, 4, dtype=torch.float, device='cuda') - s = self.checkScript(func, (a, b)) - self.assertAllFused(s.graph_for(a, b)) + funcs = (func2, funcInf) + for 
f in funcs: + s = self.checkScript(f, (a, b)) + self.assertAllFused(s.graph_for(a, b)) - c = s(a, b) - c.sum().backward() - self.assertAllFused(backward_graph(s)) + c = s(a, b) + c.sum().backward() + self.assertAllFused(backward_graph(s)) def test_mul(self): def func(a, b): @@ -3448,6 +3454,80 @@ def test_over_slice(): return a[3:10] == [3, 4] self.checkScript(test_backward_slice, ()) + def test_mutable_list(self): + def test_append(): + a = [0, 1] + a.append(2) + a.append(3) + return a == [0, 1, 2, 3] + self.checkScript(test_append, ()) + + def test_append_2(): + a = [0, 1] + a.append(2) + a = [1] + a.append(4) + return a == [1, 4] + self.checkScript(test_append_2, ()) + + def test_append_if(): + a = [1] + if True: + a.append(4) + return a == [1, 4] + self.checkScript(test_append_if, ()) + + def test_append_if_else(): + a = [1] + if False: + a.append(4) + else: + a.append(10) + return a == [1, 10] + self.checkScript(test_append_if_else, ()) + + def test_append_loop(): + a = _construct_empty_int_list() + for i in range(5): + a.append(i) + + return a == [0, 1, 2, 3, 4] + self.checkScript(test_append_loop, ()) + + def test_append_loop_if(): + a = _construct_empty_int_list() + for i in range(5): + if i > 3: + a.append(i) + else: + a.append(0) + + return a == [0, 0, 0, 0, 4] + self.checkScript(test_append_loop_if, ()) + + def test_nested_loop(): + a = _construct_empty_int_list() + for i in range(2): + for j in range(2): + a.append(i + j) + + return a == [0, 1, 1, 2] + self.checkScript(test_append_loop_if, ()) + + def test_mutable_list_function_inline(self): + @torch.jit.script + def bar(y): + # type: (List[int]) -> List[int] + y.append(4) + + @torch.jit.script + def foo(): + x = [1, 2, 3] + bar(x) + return x + + self.assertEqual(foo(), [1, 2, 3, 4]) + def test_func_call(self): script = ''' def add(a, b): @@ -4845,7 +4925,6 @@ def bar(): bar() def test_tuples(self): - @torch.jit.script def foo(i): a = (i + 4, i * 2) c = a @@ -4857,10 +4936,12 @@ def foo(i): while False: t0, t1 = c c = (t1, t0) - return t0 + x = (1,) + y = 1, + return t0, x, y v = torch.rand(10, 3) - self.assertEqual(v * 9, foo(v)) + self.checkScript(foo, (v,)) with self.assertRaisesRegex(RuntimeError, r"variable 'a' previously has type \(Tensor, Tensor\)"): @torch.jit.script @@ -6491,7 +6572,7 @@ def script_fn(x): # Note: the neg op from script_fn1 should be properly inlined into the # graph of script_fn - self.assertExpected(str(script_fn.graph)) + self.assertExpected(canonical(script_fn.graph)) def test_call_script_mod_from_script_fn(self): class ScriptMod(torch.jit.ScriptModule): @@ -6508,7 +6589,7 @@ def forward(self, x): def script_fn(x): return sm(x) + 1 - self.assertExpected(str(script_fn.graph)) + self.assertExpected(canonical(script_fn.graph)) def test_call_python_fn_from_script_module(self): def python_fn(x): @@ -6607,7 +6688,7 @@ def forward(self, x): return script_fn(torch.mm(x, self.param)) sm = ScriptMod() - self.assertExpected(str(sm.__getattr__('forward').graph)) + self.assertExpected(canonical(sm.__getattr__('forward').graph)) def test_call_script_mod_from_script_module(self): class ScriptMod1(torch.jit.ScriptModule): @@ -6633,7 +6714,7 @@ def forward(self, x): # Note: the parameters from both modules should appear in the flattened # input list to the graph. 
The mm op from ScriptMod1 should be properly # inlined - self.assertExpected(str(sm.graph)) + self.assertExpected(canonical(sm.graph)) def test_module_with_params_called_fails(self): with self.assertRaisesRegex(RuntimeError, "Attempted to inline a Module with parameters. Stateful " @@ -7168,6 +7249,7 @@ def test_dcgan_models(self): self._test_dcgan_models(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_dcgan_models_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_dcgan_models(self, device='cuda', check_export_import=False) @@ -7290,11 +7372,13 @@ def test_mnist(self): self._test_mnist(self, device='cpu') @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_cuda(self): # XXX: export_import on CUDA modules doesn't work (#11480) self._test_mnist(self, device='cuda', check_export_import=False) @unittest.skipIf(not RUN_CUDA, "no CUDA") + @skipIfRocm def test_mnist_training_leaks_no_memory_cuda(self): net = MnistNet().cuda() # MnistNet uses dropout, don't check its trace diff --git a/test/test_nn.py b/test/test_nn.py index 0d61d72f3ceb66..eee4e3a7c74755 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4202,6 +4202,7 @@ def get_inputs(input_shape, hidden_shape, mode): test(input_shape, hidden_shape, mode) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @skipIfRocm def test_rnn_check_device(self): input_size = 3 hidden_size = 5 diff --git a/test/test_sparse.py b/test/test_sparse.py index 1304f42bda78fa..a91681d4767049 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -496,6 +496,76 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(3, 10, [100, 100, 100, 5, 5, 5, 0]) test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) + @skipIfRocm + def test_Sparse_to_Sparse_copy_(self): + # This is for testing torch.copy_(SparseTensor, SparseTensor) + sparse_dims = 3 + nnz = 10 + sizes = [2, 3, 4, 5] # hybrid sparse + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + + # test copy + x2_dense = x2.to_dense() + x1.copy_(x2) + self.assertEqual(x2_dense, x1.to_dense()) + + # test type conversion (when x1.copy_(x2), x1.dtype should stay the same) + x1 = x1.to(torch.float32) + x2 = x2.to(torch.float64) + x1_dtype = x1.dtype + x1.copy_(x2) + self.assertEqual(x1_dtype, x1.dtype) + + # test no broadcast + self.assertRaises(RuntimeError, lambda: x1.copy_(x2.narrow_copy(0, 0, 1))) + + # test raise error on copy_() between dense and sparse Tensors + self.assertRaises(RuntimeError, lambda: x1.copy_(torch.randn(5, 5))) + + # test autograd + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + x2.requires_grad_(True) + x1.copy_(x2) + y = x1 * 2 + x2_clone = x2.clone() + y.backward(x2_clone) + expected_grad = x2_clone * 2 + self.assertEqual(expected_grad.to_dense(), x2.grad.to_dense()) + self.assertEqual(None, x1.grad) + + @unittest.skipIf(torch.cuda.device_count() < 2, "no multi-GPU") + @skipIfRocm + def test_Sparse_to_Sparse_copy_multi_gpu(self): + # This is for testing torch.copy_(SparseTensor, SparseTensor) across GPU devices + sparse_dims = 3 + nnz = 10 + sizes = [2, 3, 4, 5] # hybrid sparse + x1, _, _ = self._gen_sparse(sparse_dims, nnz, sizes) + x2, _, _ = self._gen_sparse(sparse_dims, nnz + 10, sizes) + x1 = x1.to('cuda:0') + + def test_cross_device(x1, x2): + x1_device = x1.device + x1.copy_(x2) + self.assertEqual(x2.to('cuda:0').to_dense(), x1.to_dense()) + 
self.assertEqual(x1_device, x1.device) + + test_cross_device(x1, x2.to('cuda:1')) # test across gpu devices + test_cross_device(x1, x2.to('cpu')) # test between cpu and gpu + + # test autograd + x2 = x2.to('cuda:1') + x2.requires_grad_(True) + x1.copy_(x2) + y = x1 * 2 + x2_clone = x2.clone().to('cuda:0') + y.backward(x2_clone) + expected_grad = x2_clone * 2 + self.assertEqual(expected_grad.to_dense(), x2.grad.to('cuda:0').to_dense()) + self.assertEqual(None, x1.grad) + @cuda_only def test_cuda_empty(self): def test_tensor(x): @@ -1023,6 +1093,34 @@ def test_shape(i_shapes, v_shapes, nnzs): test_shape([0, 3, 4], [3, 4, 5, 6], [0]) test_shape([2, 3, 4], [0, 4, 5, 6], [9, 12]) + def _test_narrow(self, input, narrow_args): + expected = input.to_dense().narrow(*narrow_args) + self.assertEqual(expected, input.narrow_copy(*narrow_args).to_dense()) + + def _all_narrow_combs(self, shape): + for dim, dim_sz in enumerate(shape): + for start in range(dim_sz): + for length in range(dim_sz - start): + yield [dim, start, length] + + @skipIfRocm + def test_narrow(self): + shape = [3, 3, 4, 2] + input, _, _ = self._gen_sparse(4, 19, shape) + for narrow_args in self._all_narrow_combs(shape): + self._test_narrow(input, narrow_args) + + self.assertRaises(RuntimeError, lambda: input.narrow_copy(-1, 0, 3)) # dim < 0 + self.assertRaises(RuntimeError, lambda: input.narrow_copy(10, 0, 3)) # dim > input.dim() + self.assertRaises(RuntimeError, lambda: input.narrow_copy(0, shape[0] + 1, 3)) # start > size of dim + self.assertRaises(RuntimeError, lambda: input.narrow_copy(0, 2, shape[0])) # start+length > size of dim + + with_dense, _, _ = self._gen_sparse(2, 7, shape) + for narrow_args in self._all_narrow_combs(shape): + self._test_narrow(with_dense, narrow_args) + + self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim + def _test_log1p_tensor(self, input, dense_tensor): expected_output = torch.tensor(dense_tensor).log1p_() self.assertEqual(expected_output, input.log1p().to_dense()) @@ -1410,6 +1508,7 @@ def test_tensor(indices, values, indices_equal, values_equal): test_tensor(indices, values, False, True) # An empty tensor's data_ptr is always equal to 0 @cpu_only # just run once, we test both cpu and cuda + @skipIfRocm def test_constructor_device_legacy(self): i = torch.tensor([[0, 1, 1], [2, 0, 2]]) v = torch.tensor([3., 4., 5.]) @@ -1556,6 +1655,7 @@ def test_resize(self): self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3], [1, 1], [1, 2, 0], [2, 2, 0]) + @skipIfRocm def test_is_nonzero(self): self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero()) self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero()) diff --git a/test/test_torch.py b/test/test_torch.py index 84ef8a22e050b3..3026548b99043e 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -3999,6 +3999,7 @@ def test_is_signed_cuda(self): self.assertEqual(torch.cuda.HalfTensor(10).is_signed(), True) @skipIfNoLapack + @skipIfRocm def test_gesv(self): a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), (-6.05, -3.30, 5.36, -4.44, 1.08), @@ -4130,6 +4131,7 @@ def test_gesv_batched_dims(self): self._test_gesv_batched_dims(self, lambda t: t) @skipIfNoLapack + @skipIfRocm def test_qr(self): # Since the QR decomposition is unique only up to the signs of the rows of @@ -4312,10 +4314,12 @@ def _test_trtrs(self, cast): self.assertEqual(res1, tb, 0) @skipIfNoLapack + @skipIfRocm def test_trtrs(self): self._test_trtrs(self, lambda t: t) @skipIfNoLapack + @skipIfRocm def 
test_gels(self): def _test_underdetermined(a, b, expectedNorm): m = a.size()[0] @@ -4431,6 +4435,7 @@ def check_norm(a, b, expected_norm, gels_result): self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8) @skipIfNoLapack + @skipIfRocm def test_eig(self): a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00), (-6.49, 3.80, 0.00, 0.00, 0.00), diff --git a/test/test_utils.py b/test/test_utils.py index 971e8a4f05f8e0..dff6102e4579e7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -25,16 +25,6 @@ from common import TestCase, run_tests, download_file -try: - import cffi - HAS_CFFI = True -except ImportError: - HAS_CFFI = False - - -if HAS_CFFI: - from torch.utils.ffi import create_extension - class SimplePlugin(Plugin): @@ -371,74 +361,9 @@ def test_model_gradient(self): class TestFFI(TestCase): - - def setUp(self): - self.tmpdir = tempfile.mkdtemp() - os.chdir(self.tmpdir) - sys.path.append(self.tmpdir) - - def tearDown(self): - shutil.rmtree(self.tmpdir) - - @unittest.skipIf(not HAS_CFFI, "ffi tests require cffi package") - @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") - @unittest.skipIf(IS_PPC, "skip for ppc64le due to incompatible exception handling") - def test_cpu(self): - create_extension( - name='test_extensions.cpulib', - headers=[test_dir + '/ffi/src/cpu/lib.h'], - sources=[ - test_dir + '/ffi/src/cpu/lib1.c', - test_dir + '/ffi/src/cpu/lib2.c', - ], - verbose=False, - ).build() - from test_extensions import cpulib - tensor = torch.ones(2, 2).float() - - cpulib.good_func(tensor, 2, 1.5) - self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5) - - new_tensor = cpulib.new_tensor(4) - self.assertEqual(new_tensor, torch.ones(4, 4) * 4) - - f = cpulib.int_to_float(5) - self.assertIs(type(f), float) - - self.assertRaises(TypeError, - lambda: cpulib.good_func(tensor.double(), 2, 1.5)) - self.assertRaises(torch.FatalError, - lambda: cpulib.bad_func(tensor, 2, 1.5)) - - @unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package") - @unittest.skipIf(IS_WINDOWS, "ffi doesn't currently work on Windows") - @skipIfRocm - def test_gpu(self): - from torch.utils.cpp_extension import CUDA_HOME - create_extension( - name='gpulib', - headers=[test_dir + '/ffi/src/cuda/cudalib.h'], - sources=[ - test_dir + '/ffi/src/cuda/cudalib.c', - ], - with_cuda=True, - verbose=False, - include_dirs=[os.path.join(CUDA_HOME, 'include')], - ).build() - import gpulib - tensor = torch.ones(2, 2).float() - - gpulib.good_func(tensor, 2, 1.5) - self.assertEqual(tensor, torch.ones(2, 2) * 2 + 1.5) - - ctensor = tensor.cuda().fill_(1) - gpulib.cuda_func(ctensor, 2, 1.5) - self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5) - - self.assertRaises(TypeError, - lambda: gpulib.cuda_func(tensor, 2, 1.5)) - self.assertRaises(TypeError, - lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5)) + def test_deprecated(self): + with self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. 
Please use cpp extensions instead."): + from torch.utils.ffi import create_extension @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') diff --git a/third_party/ideep b/third_party/ideep index 4bd9a6800bf7db..dedff8fb8193fe 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 4bd9a6800bf7db068187619e0582d34dec9651dc +Subproject commit dedff8fb8193fe3a1ea893d4bc852f8ea395b6b3 diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 81856c62ad07d9..7e26d84432182c 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -28,7 +28,8 @@ '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', 'randint(_out)?', '_local_scalar', '_local_scalar_dense', - 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to' + 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear', 'to', + 'copy_sparse_to_sparse_' ] # These function signatures are not exposed to Python. Note that this signature diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 24ac92dd63926f..64ad9fc5e6d185 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -416,7 +416,9 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block grad_fn->src_device = src.get_device(); } } - baseType->s_copy_(self_, src_, non_blocking); + if (self.is_sparse() && src.is_sparse()) baseType->copy_sparse_to_sparse_(self_, src_, non_blocking); + else if (!self.is_sparse() && !src.is_sparse()) baseType->s_copy_(self_, src_, non_blocking); + else AT_ERROR("copy_() between dense and sparse Tensors is not implemented! Found self type = ", self.type(), " and src type = ", src.type()); increment_version(self); rebase_history(as_variable_ref( self ), std::move(grad_fn)); if(torch::jit::tracer::isTracing()) { diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index ce337e93c85463..27e580e8965edf 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -149,6 +149,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/ir.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/annotate_effects.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp @@ -174,6 +175,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp + ${TORCH_SRC_DIR}/csrc/jit/script/builtin_functions.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index f5c7d41c199e0b..832de8d76db4b0 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1453,6 +1453,17 @@ def callable(a, b) -> number [ 8, 9]]) """) +add_docstr_all('narrow_copy', + r""" +narrow_copy(dimension, start, length) -> Tensor + +Same as :meth:`Tensor.narrow` except returning a copy rather +than shared storage. This is primarily for sparse tensors, which +do not have a shared-storage narrow method. Calling ``narrow_copy`` +with ``dimension > self._sparseDims()`` will return a copy with the +relevant dense dimension narrowed, and ``self.shape`` updated accordingly. 
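+
+Example (illustrative values only)::
+
+    >>> i = torch.tensor([[0, 1, 1], [2, 0, 2]])
+    >>> v = torch.tensor([3., 4., 5.])
+    >>> s = torch.sparse_coo_tensor(i, v, (2, 3))
+    >>> s.narrow_copy(1, 0, 2).to_dense()
+    tensor([[0., 0.],
+            [4., 0.]])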
+""") + add_docstr_all('ndimension', r""" ndimension() -> int diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 7601ce3c268d4c..8f3c1ae6ebf73c 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2938,7 +2938,7 @@ def parse_kwargs(desc): .. math:: \log(\Gamma_{p}(a)) = C + \displaystyle \sum_{i=1}^{p} \log\left(\Gamma\left(a - \frac{i - 1}{2}\right)\right) -where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\Gamma(.)` is the Gamma function. +where :math:`C = \log(\pi) \times \frac{p (p - 1)}{2}` and :math:`\Gamma(\cdot)` is the Gamma function. If any of the elements are less than or equal to :math:`\frac{p - 1}{2}`, then an error is thrown. diff --git a/torch/csrc/api/src/serialize/input-archive.cpp b/torch/csrc/api/src/serialize/input-archive.cpp index bd6995d67d69e9..11e97bce08f564 100644 --- a/torch/csrc/api/src/serialize/input-archive.cpp +++ b/torch/csrc/api/src/serialize/input-archive.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 1847bb65b08f8a..a5edc29833633a 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp index 2299cce245a16b..1e4e7bf7b9e37f 100644 --- a/torch/csrc/generic/serialization.cpp +++ b/torch/csrc/generic/serialization.cpp @@ -2,8 +2,6 @@ #define TH_GENERIC_FILE "generic/serialization.cpp" #else -#define SYSCHECK(call) { ssize_t __result = call; if (__result < 0) throw std::system_error((int) __result, std::system_category()); } - template void THPStorage_(writeFileRaw)(THWStorage *self, io fd) { @@ -16,23 +14,10 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) data = (scalar_t*)cpu_data.get(); THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(scalar_t), cudaMemcpyDeviceToHost)); #endif - ssize_t result = doWrite(fd, &size, sizeof(int64_t)); - if (result != sizeof(int64_t)) - throw std::system_error(result, std::system_category()); + doWrite(fd, &size, sizeof(int64_t)); // fast track for bytes and little endian if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { - char *bytes = (char *) data; - int64_t remaining = sizeof(scalar_t) * size; - while (remaining > 0) { - // we write and read in 1GB blocks to avoid bugs on some OSes - ssize_t result = doWrite(fd, bytes, THMin(remaining, 1073741824)); - if (result < 0) - throw std::system_error(result, std::system_category()); - bytes += result; - remaining -= result; - } - if (remaining != 0) - throw std::system_error(result, std::system_category()); + doWrite(fd, data, sizeof(scalar_t) * size); } else { int64_t buffer_size = std::min(size, (int64_t)5000); std::unique_ptr le_buffer(new uint8_t[buffer_size * sizeof(scalar_t)]); @@ -54,7 +39,7 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) THPByteOrder::THP_LITTLE_ENDIAN, to_convert); } - SYSCHECK(doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t))); + doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t)); } } } @@ -67,11 +52,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) { scalar_t *data; int64_t size; - ssize_t result = doRead(file, &size, sizeof(int64_t)); - if (result == 0) - throw std::runtime_error("unexpected EOF. 
The file might be corrupted."); - if (result != sizeof(int64_t)) - throw std::system_error(result, std::system_category()); + doRead(file, &size, sizeof(int64_t)); THWStoragePtr storage; if (_storage == nullptr) { storage = THWStorage_(newWithSize)(LIBRARY_STATE size); @@ -91,20 +72,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) // fast track for bytes and little endian if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { - char *bytes = (char *) data; - int64_t remaining = sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage); - while (remaining > 0) { - // we write and read in 1GB blocks to avoid bugs on some OSes - ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824)); - if (result == 0) // 0 means EOF, which is also an error - throw std::runtime_error("unexpected EOF. The file might be corrupted."); - if (result < 0) - throw std::system_error(result, std::system_category()); - bytes += result; - remaining -= result; - } - if (remaining != 0) - throw std::system_error(result, std::system_category()); + doRead(file, data, sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage)); } else { int64_t buffer_size = std::min(size, (int64_t)5000); std::unique_ptr le_buffer(new uint8_t[buffer_size * sizeof(scalar_t)]); @@ -112,7 +80,7 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) for (int64_t i = 0; i < size; i += buffer_size) { size_t to_convert = std::min(size - i, buffer_size); - SYSCHECK(doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert)); + doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert); if (sizeof(scalar_t) == 2) { THP_decodeInt16Buffer((int16_t*)data + i, @@ -142,6 +110,4 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) template THWStorage* THPStorage_(readFileRaw)(int fd, THWStorage* storage); template THWStorage* THPStorage_(readFileRaw)(PyObject* fd, THWStorage* storage); -#undef SYSCHECK - #endif diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index 10b0cad6749128..2d1b9f7b147abb 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -61,7 +61,6 @@ static_assert(sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type), struct ArgumentSpec { ArgumentSpec(bool with_grad, at::ArrayRef inputs, size_t num_flat_inputs) { hash_code = num_flat_inputs; - args.resize(num_flat_inputs); size_t offset = 0; for (size_t i = 0; i < inputs.size(); ++i) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 009bf68ae3f6da..a59c856eaba751 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -27,14 +27,10 @@ bool isDifferentiable(Node * n) { static OperatorSet differentiable_ops = { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", - "aten::add(Scalar other, Tensor self) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", - "aten::sub(Scalar other, Tensor self) -> Tensor", "aten::mul(Tensor self, Tensor other) -> Tensor", "aten::mul(Tensor self, Scalar other) -> Tensor", - "aten::mul(Scalar other, Tensor self) -> Tensor", - "aten::div(Scalar other, Tensor self) -> Tensor", "aten::div(Tensor self, Tensor other) -> Tensor", "aten::div(Tensor self, Scalar other) -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", @@ -132,9 +128,6 @@ static std::vector gradientForNode(Node* 
node, ArrayRef grad_val } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { return {grads.at(0), nullptr, nullptr}; - } else if (node->matches("aten::add(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, grads.at(0)}; - } else if (node->kind() == prim::AutogradAdd) { return {grads.at(0), grads.at(0)}; @@ -144,29 +137,23 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { return {grads.at(0), nullptr, nullptr}; - } else if (node->matches("aten::sub(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, -grads.at(0)}; - } else if (node->matches("aten::mul(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor")) { return {grads.at(0) * inputs.at(1), nullptr}; - } else if (node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, grads.at(0) * inputs.at(0)}; - } else if (node->matches("aten::div(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) / inputs.at(1), -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; } else if (node->matches("aten::div(Tensor self, Scalar other) -> Tensor")) { return {grads.at(0) / inputs.at(1), nullptr}; - } else if (node->matches("aten::div(Scalar other, Tensor self) -> Tensor")) { - return {nullptr, -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; - } else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) { - return {grads.at(0) * outputs.at(0) * (1 - outputs.at(0))}; + // TODO: The order of operations matter in this case. This + // works for ppc64le and x86_64. Need to look at why the + // order matters. 
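+    // (For reference: d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), so both orderings
+    // compute grad_output * out * (1 - out); they differ only in floating-point association.)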
+ return {(1 - outputs.at(0)) * outputs.at(0) * grads.at(0)}; } else if (node->matches("aten::tanh(Tensor self) -> Tensor")) { return {grads.at(0) * (1 - outputs.at(0) * outputs.at(0))}; diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index f1844d2bac6651..1633ac0de4d6aa 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -42,6 +42,8 @@ Value* insertConstant( n->destroy(); n = g.create(prim::None); n->output()->setType(NoneType::get()); + } else if(val.isWorld()) { + n->output()->setType(WorldType::get()); } else { throw constant_not_supported_error("Unsupported value kind: " + val.tagKind()); } diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 437d0f6c779972..973780b7d7d62f 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -425,7 +426,7 @@ void GraphEncoder::EncodeTensor( class ModuleEncoder: public EncoderBase { public: ModuleEncoder(const script::Module &module, - const std::string &filename); + std::ostream& out); private: void EncodeModule(onnx::GraphProto *graph_proto, const script::Module &module); @@ -448,7 +449,7 @@ class ModuleEncoder: public EncoderBase { virtual void EncodeTensor(onnx::TensorProto *tensor_proto, const at::Tensor &tensor, - const at::optional external_ref) override; + const at::optional external_ref = {}) override; virtual void EncodeIntermediateValueInfo(onnx::GraphProto *graph_proto, const Value* n) override; @@ -462,7 +463,7 @@ class ModuleEncoder: public EncoderBase { const TypePtr& type, const std::string& name); - PyTorchFileWriter file_writer_; + PyTorchStreamWriter stream_writer_; // Used to deduplicate tensor storages std::unordered_map storage_dedup_map_; @@ -475,9 +476,9 @@ class ModuleEncoder: public EncoderBase { ModuleEncoder::ModuleEncoder( const script::Module &module, - const std::string &filename) + std::ostream& out) : EncoderBase(onnx_torch::OperatorExportTypes::RAW, false), - file_writer_(filename) { + stream_writer_(out) { model_proto_.set_doc_string("THIS PROTO IS NOT STANDARD ONNX"); EncodeModule(model_proto_.mutable_graph(), module); } @@ -564,6 +565,10 @@ void ModuleEncoder::EncodeTypeInfo( type_proto->set_denotation("GeneratorType"); } else if (kind == TypeKind::StringType) { type_proto->set_denotation("StringType"); + } else if (kind == TypeKind::VarType) { + type_proto->set_denotation("TypeVar:" + type->expect()->name()); + } else if (kind == TypeKind::WorldType) { + type_proto->set_denotation("WorldType"); } else { throw std::runtime_error("unexpected type kind"); } @@ -582,7 +587,7 @@ void ModuleEncoder::EncodeModule( EncodeParameters(graph_proto, module, ""); EncodeMethods(graph_proto, module, ""); auto str = model_proto_.SerializeAsString(); - file_writer_.writeRecord(str.data(), str.size()); + stream_writer_.writeRecord(str.data(), str.size()); } void ModuleEncoder::EncodeParameters( @@ -670,7 +675,7 @@ void ModuleEncoder::EncodeMethod( void ModuleEncoder::EncodeTensor( onnx::TensorProto *tensor_proto, const at::Tensor &tensor, - const at::optional external_ref = {}) { + const at::optional external_ref) { auto storage_ptr = tensor.storage().unsafeGetStorageImpl(); auto dedup_it = storage_dedup_map_.find(storage_ptr); if (dedup_it != storage_dedup_map_.end()) { @@ -689,7 +694,7 @@ void ModuleEncoder::EncodeTensor( .cpu(); } - auto record_number = file_writer_.writeRecord( + auto record_number = stream_writer_.writeRecord( 
static_cast(t.storage().data()), t.type().elementSizeInBytes() * t.storage().size()); tensor_proto->add_int64_data(record_number); storage_dedup_map_[storage_ptr] = record_number; @@ -915,8 +920,14 @@ std::tuple ExportGraph( graph_encoder.get_raw_data_export_map()); } +void ExportModule(const script::Module& module, std::ostream& out) { + ModuleEncoder(module, out); +} + void ExportModule(const script::Module& module, const std::string &filename) { - ModuleEncoder(module, filename); + std::ofstream out(filename, std::ios_base::binary); + + ExportModule(module, out); } }} diff --git a/torch/csrc/jit/export.h b/torch/csrc/jit/export.h index f7eee3dc77ac07..363de0b56ac169 100644 --- a/torch/csrc/jit/export.h +++ b/torch/csrc/jit/export.h @@ -4,6 +4,8 @@ #include "torch/csrc/jit/script/module.h" #include "torch/csrc/onnx/onnx.h" +#include + namespace torch { namespace jit { // This map is used to keep track of parameters that should be exported @@ -34,6 +36,10 @@ TORCH_API std::string PrettyPrintExportedGraph( = ::torch::onnx::OperatorExportTypes::ONNX, bool google_printer = false); +TORCH_API void ExportModule( + const script::Module& module, + std::ostream& out); + TORCH_API void ExportModule( const script::Module& module, const std::string& filename); diff --git a/torch/csrc/jit/function_schema.h b/torch/csrc/jit/function_schema.h index c7b53abf46c2a2..dcaaf766e18c0e 100644 --- a/torch/csrc/jit/function_schema.h +++ b/torch/csrc/jit/function_schema.h @@ -46,7 +46,10 @@ struct FunctionSchema { arguments(std::move(arguments)), returns(std::move(returns)), is_vararg(is_vararg), - is_varret(is_varret) {} + is_varret(is_varret), + is_mutable(isMutable()) { + validate(); + } FunctionSchema( Symbol name, std::vector arguments, @@ -58,7 +61,9 @@ struct FunctionSchema { std::move(std::move(arguments)), std::move(std::move(returns)), is_vararg, - is_varret) {} + is_varret) { + validate(); + } const std::string name; const std::vector arguments; @@ -69,6 +74,8 @@ struct FunctionSchema { // arguments are not checked by schema const bool is_vararg; const bool is_varret; + const bool is_mutable; + at::optional argumentIndexWithName(const std::string& name) const { for(size_t i = 0; i < arguments.size(); ++i) { if(name == arguments[i].name) @@ -76,6 +83,23 @@ struct FunctionSchema { } return at::nullopt; } + + private: + bool isMutable() const { + return std::any_of( + arguments.cbegin(), arguments.cend(), [](const Argument& arg) { + return arg.type == WorldType::get(); + }); + } + + void validate() const { + if (is_mutable) { + // Mutable schemas should have a world token as the first argument + // and return. + JIT_ASSERT(arguments.at(0).type == WorldType::get()); + JIT_ASSERT(returns.at(0).type == WorldType::get()); + } + } }; // for debugging, make sure we can describe the call site diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp index 6095bb13748470..5718a656a7f520 100644 --- a/torch/csrc/jit/fusers/common/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -235,9 +236,22 @@ static std::string scalarValue(int64_t v) { return std::to_string(v); } +// Note: The NAN, NEG_INFINITY and POS_INFINITY strings map to device-specific +// implementations of these special values. These macros are found in the +// resource strings for each device. 
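+// (For example, the CPU resource string maps POS_INFINITY to INFINITY, while the CUDA
+// resource string maps it to __int_as_float(0x7f800000).)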
static std::string scalarValue(double v) { std::ostringstream out; - out << std::scientific << v << "f"; + if (std::isnan(v)) { + out << "NAN"; + } else if (std::isinf(v)) { + if (v < 0) { + out << "NEG_INFINITY"; + } else { + out << "POS_INFINITY"; + } + } else { + out << std::scientific << v << "f"; + } return out.str(); } diff --git a/torch/csrc/jit/fusers/cpu/resource_strings.h b/torch/csrc/jit/fusers/cpu/resource_strings.h index 60c1c0faaa4fea..59a92ccc19b740 100644 --- a/torch/csrc/jit/fusers/cpu/resource_strings.h +++ b/torch/csrc/jit/fusers/cpu/resource_strings.h @@ -11,6 +11,10 @@ Correct code for this case is generated, however, nvrtc does not know how to han so typedefs help it handle those cases*/ auto type_declarations_template = CodeTemplate(R"( + +#define POS_INFINITY INFINITY +#define NEG_INFINITY -INFINITY + typedef ${IndexType} IndexType; template struct TensorInfo { diff --git a/torch/csrc/jit/fusers/cuda/resource_strings.h b/torch/csrc/jit/fusers/cuda/resource_strings.h index 0063288721d727..6278a4f239636c 100644 --- a/torch/csrc/jit/fusers/cuda/resource_strings.h +++ b/torch/csrc/jit/fusers/cuda/resource_strings.h @@ -18,6 +18,10 @@ typedef long long int int64_t; ${HalfHeader} ${RandHeader} +#define NAN __int_as_float(0x7fffffff) +#define POS_INFINITY __int_as_float(0x7f800000) +#define NEG_INFINITY __int_as_float(0xff800000) + typedef ${IndexType} IndexType; template struct TensorInfo { diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index d071c464721559..20ee429b3696c8 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -7,6 +7,7 @@ #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/tracer.h" +#include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/passes/batch_mm.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index b2fa6eba2f748a..4574addb3a4465 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -181,7 +182,7 @@ void DecoderBase::buildBlock(const onnx::GraphProto& graph_proto, Block* block, class ModuleDecoder : DecoderBase { public: ModuleDecoder(ModuleLookup module_lookup, - const std::string& filename); + std::istream& in); private: virtual std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) override; @@ -205,7 +206,7 @@ class ModuleDecoder : DecoderBase { ModuleLookup module_lookup, const std::string fullname); - PyTorchFileReader file_reader_; + PyTorchStreamReader stream_reader_; std::unordered_map> storage_map_; std::unordered_map value_type_map_; }; @@ -260,8 +261,12 @@ TypePtr ModuleDecoder::buildType(const onnx::TypeProto& type_proto) { return NoneType::get(); } else if (kind == "GeneratorType") { return GeneratorType::get(); - }else if (kind == "StringType") { + } else if (kind == "WorldType") { + return WorldType::get(); + } else if (kind == "StringType") { return StringType::get(); + } else if (kind.find("TypeVar:") == 0) { + return VarType::create(kind.substr(strlen("TypeVar:"))); } else { throw std::runtime_error("unexpected string for type kind"); } @@ -315,7 +320,7 @@ at::Tensor ModuleDecoder::buildTensorCommon( if (storage_it == storage_map_.end()) { at::DataPtr storage_ptr; int64_t size; - std::tie(storage_ptr, size) = 
file_reader_.getRecordWithKey(record_number); + std::tie(storage_ptr, size) = stream_reader_.getRecordWithKey(record_number); auto storage = std::make_shared( at::CPU(type).typeMeta(), std::move(storage_ptr), @@ -349,10 +354,10 @@ std::pair, std::string> ModuleDecoder::parseFull ModuleDecoder::ModuleDecoder( ModuleLookup module_lookup, - const std::string &filename) : - file_reader_(filename) { + std::istream& in) : + stream_reader_(in) { auto model_proto = onnx::ModelProto(); - auto record = file_reader_.getLastRecord(); + auto record = stream_reader_.getLastRecord(); model_proto.ParsePartialFromArray(std::get<0>(record).get(), std::get<1>(record)); auto graph_proto = model_proto.graph(); @@ -391,13 +396,21 @@ ModuleDecoder::ModuleDecoder( } // namespace +void import_ir_module( + ModuleLookup module_lookup, + std::istream& in) { + ModuleDecoder(module_lookup, in); +} + void import_ir_module( ModuleLookup module_lookup, const std::string& filename) { - ModuleDecoder(module_lookup, filename); + std::ifstream in(filename, std::ios_base::binary); + + ModuleDecoder(module_lookup, in); } -std::shared_ptr load(const std::string& filename) { +std::shared_ptr load(std::istream& in) { auto module = std::make_shared(); auto module_lookup = [&](const std::vector& qualified_name) { @@ -410,7 +423,17 @@ std::shared_ptr load(const std::string& filename) { } return curr; }; - ModuleDecoder(module_lookup, filename); + + ModuleDecoder(module_lookup, in); + + return module; +} + +std::shared_ptr load(const std::string& filename) { + std::ifstream in(filename, std::ios_base::binary); + + auto module = load(in); + return module; } diff --git a/torch/csrc/jit/import.h b/torch/csrc/jit/import.h index 6ce901c4369961..a1e0b31fe2295a 100644 --- a/torch/csrc/jit/import.h +++ b/torch/csrc/jit/import.h @@ -3,6 +3,8 @@ #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/script/module.h" +#include + namespace torch { namespace jit { @@ -13,11 +15,18 @@ TORCH_API void import_ir_module( ModuleLookup module_lookup, const std::string& filename); +TORCH_API void import_ir_module( + ModuleLookup module_lookup, + std::istream& in); + /// Loads a serialized `script::Module` from the given `filename`. /// /// The file stored at the location given in `filename` must contain a /// serialized `script::Module`, exported either via `ScriptModule.save()` in /// Python or `torch::jit::ExportModule` in C++. 
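+///
+/// A minimal usage sketch for the stream overload (the file name below is only
+/// an illustration):
+///
+///   std::ifstream in("my_module.pt", std::ios_base::binary);
+///   std::shared_ptr<script::Module> module = torch::jit::load(in);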
+ +TORCH_API std::shared_ptr load(std::istream& in); + TORCH_API std::shared_ptr load(const std::string& filename); } // namespace jit diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 98a7b010419324..ac6f9ac4a15c1c 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -227,7 +227,6 @@ void initJITBindings(PyObject *module) { return createPyObjectForStack(std::move(stack)); }); - py::class_(m, "PyTorchFileWriter") .def(py::init()) .def("write_record", &PyTorchFileWriter::writeRecord) diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index e1d76dde56c59d..b4e6b7c1398f1b 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -59,6 +59,11 @@ namespace torch { namespace jit { _(prim, ConstantChunk) \ _(prim, NoneGenerator) \ _(aten, floordiv) \ + _(prim, MemoryFence) \ + _(prim, LoadWorld) \ + _(prim, StoreWorld) \ + _(prim, DummyWorld) \ + _(aten, append) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 0d2e22307527b6..14e7fab54d9549 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -32,7 +32,7 @@ namespace torch { namespace jit { // to what the instructions will look like. // In particular we: // * (TODO) desugar Loop trip counts into c = 0, c += 1 instructions in the loop -// * flatten stages so that each stage starts with a load from the stack +// * flatten stages so that each stage starts with a load to registers // and ends with a store to the stack // *. computes move_flags (see Outputs), and inserts // * Drop nodes are inserted for any node that is unused to create a dummy use @@ -72,8 +72,6 @@ Value* createTripCountConjunctiveCondition( return new_cond; } -} // namespace - // this currently just _removes_ the trip count inputs and checks they are // unused. 
In the future they will be desugared into normal arithmetic to // provide a loop counter @@ -142,9 +140,9 @@ static std::vector> flattenStages(Graph & graph) { auto it = graph.nodes().begin(); for(size_t i = 0; i <= graph.stage(); i++) { stage_input_types.emplace_back(); - auto store = graph.create(prim::Store, 0)->insertBefore(*it); + auto load = graph.create(prim::Load, 0)->insertBefore(*it); while(input_pos < graph.inputs().size() && graph.inputs()[input_pos]->stage() == i) { - auto nv = store->addOutput(); + auto nv = load->addOutput(); auto old_node = graph.inputs()[input_pos]; nv->setType(old_node->type()); stage_input_types[i].push_back(old_node->type()); @@ -153,9 +151,9 @@ static std::vector> flattenStages(Graph & graph) { } while(it != graph.nodes().end() && it->stage() == i) ++it; - auto load = graph.create(prim::Load, 0)->insertBefore(*it); + auto store = graph.create(prim::Store, 0)->insertBefore(*it); while(output_pos < graph.outputs().size() && graph.outputs()[output_pos]->stage() == i) { - load->addInput(graph.outputs()[output_pos]); + store->addInput(graph.outputs()[output_pos]); output_pos++; } } @@ -307,6 +305,7 @@ std::unordered_map> findLastUses(Graph & g) { return FindLastUses(g).move_flags; } +} //namespace // pre-processing that happens once per graph struct PreprocessGraph { @@ -503,10 +502,10 @@ struct CodeImpl { insertInstruction(node); } break; } - // each stage ends with a load instruction + // each stage ends with a store instruction // we record where these instructions occur, and use them to // exit the interpreter - if(node->kind() == prim::Load) { + if(node->kind() == prim::Store) { stage_end.push_back(instructions.size()); } } @@ -694,7 +693,7 @@ struct InterpreterStateImpl { for(int i = inst.outputs.size - 1; i >= 0; i--) { int reg = get(inst.outputs,i); registers[reg] = pop(stack); - // std::cout << "pop reg[" << reg << "];\n" << registers[reg].pImpl << "\n"; + // std::cout << "pop reg[" << reg << "];\n" << registers[reg] << "\n"; } pc = new_pc; } catch(std::exception & e) { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 82b14fa0b6839d..90451494bacbc7 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -248,18 +248,22 @@ void Node::lint() const { } // Node subclass invariants - // - Return uses is zero - // - Param inputs is zero - // - Select inputs is one - // - Python operator cconv is correct - IR_IF(this,Constant) JIT_ASSERT(inputs_.size() == 0); + IR_ELSEIF(LoadWorld) + JIT_ASSERT(inputs_.size() == 0); + JIT_ASSERT(outputs_.size() == 1); + IR_ELSEIF(StoreWorld) + JIT_ASSERT(inputs_.size() == 1); + JIT_ASSERT(outputs_.size() == 0); IR_ELSEIF(Return) + // Return uses is zero JIT_ASSERT(outputs().size() == 0); IR_ELSEIF(Param) + // Param inputs is zero JIT_ASSERT(inputs_.size() == 0); IR_ELSEIFM_CONST(PythonOp) + // Python operator cconv is correct size_t n_scalars = 0, n_tensors = 0; for (auto c : value->cconv) { if (c == 'c') { @@ -381,6 +385,7 @@ void Graph::lint() const { for (auto n : b->nodes()) { JIT_ASSERT(n->kind_ != prim::Param); JIT_ASSERT(n->kind_ != prim::Return); + JIT_ASSERT(n->kind_ != prim::DummyWorld); check_node(n); } @@ -447,6 +452,7 @@ void Block::cloneFrom(Block * src, std::function value_map) { local_map[input] = this->addInput()->copyMetadata(input)->setStage(input->stage()); graph->setStage(std::max(graph->stage(), input->stage())); } + for(auto node : src->nodes()) { auto new_node = this->appendNode(graph->createClone(node, env)); new_node->setStage(node->stage()); @@ -466,8 +472,9 @@ void 
Block::cloneFrom(Block * src, std::function value_map) { std::shared_ptr Graph::copy() { auto new_g = std::make_shared(); - auto env = [](Value *) -> Value* { - AT_ERROR("Graph::copy() encountered a use of a value not in scope. Run lint!"); + auto env = [](Value* v) -> Value* { + AT_ERROR( + "Graph::copy() encountered a use of a value not in scope. Run lint!"); }; new_g->block()->cloneFrom(this->block(), env); return new_g; diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 062d0422c2be07..0bb5c899c7321d 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -236,7 +236,7 @@ struct Value { void replaceFirstUseWith(Value * newValue); - // Replaces all uses of this node with 'newValue'. + // Replaces all uses of this value with 'newValue'. // // Given: %3 = f(%1, %2) // %4 = g(%3) @@ -320,6 +320,9 @@ struct Node : public Attributes { Block * owningBlock() { return owning_block_; } + const Block * owningBlock() const { + return owning_block_; + } size_t stage() const { return stage_; } @@ -442,33 +445,33 @@ struct Node : public Attributes { // Given: %3 = f(%1, %2) // Execute: %3.addInput(%4) // Result: %3 = f(%1, %2, %4) - Value* addInput(Value * node) { - JIT_ASSERT(graph_ == node->owningGraph()); + Value* addInput(Value * value) { + JIT_ASSERT(graph_ == value->owningGraph()); schema_ = nullptr; - node->uses_.emplace_back(this, inputs_.size()); - inputs_.push_back(node); - return node; + value->uses_.emplace_back(this, inputs_.size()); + inputs_.push_back(value); + return value; } - // Add 'node' as an input to 'this' at the specified position in the - // arguments. Returns the added node for ease of chaining. - Value* insertInput(size_t i, Value* node) { - JIT_ASSERT(graph_ == node->owningGraph()); + // Add 'value' as an input to 'this' at the specified position in the + // arguments. Returns the added value for ease of chaining. + Value* insertInput(size_t i, Value* value) { + JIT_ASSERT(graph_ == value->owningGraph()); schema_ = nullptr; // First we update the offsets for all existing inputs that will reside // after the one we're inserting. Concretely, these are the inputs at // indices [i, # input). Since we're inserting one input before all of - // these inputs, increment their use offsets for this Node by 1 + // these inputs, increment their use offsets for this value by 1 for (size_t use_itr = i; use_itr < inputs_.size(); ++use_itr) { // See Note [User node does not uniquely identify use] auto use = findUseForInput(use_itr); use->offset += 1; } // Insert the actual input at the specified index - inputs_.insert(inputs_.begin() + i, node); + inputs_.insert(inputs_.begin() + i, value); // Register the new use of the value we're inserted as an input. - node->uses_.emplace_back(this, i); - return node; + value->uses_.emplace_back(this, i); + return value; } // Replace the input of 'this' at position 'i' with @@ -549,7 +552,7 @@ struct Node : public Attributes { return {blocks_.data(), blocks_.size()}; } - // Insert unattached 'this' node after 'n' in the topological order. + // Insert unattached 'this' node before 'n' in the topological order. // Returns this (for chaining). 
// // Given: %3 = f(%1, %2) @@ -804,8 +807,8 @@ struct Block { void eraseInput(size_t i) { input_->eraseOutput(i); } - size_t registerOutput(Value * n) { - output_->addInput(n); + size_t registerOutput(Value * v) { + output_->addInput(v); return outputs().size() - 1; } size_t insertOutput(size_t i, Value* n) { @@ -1107,6 +1110,12 @@ friend struct Block; return jit::insertConstant(*this, std::move(val), loc); } + Value* insertDummyWorld() { + auto node = create(prim::DummyWorld, 1); + node->output()->setType(WorldType::get()); + return insertNode(node)->output(); + } + // schema-driven insert // this inserts a node into the graph with inputs determined from args and kwargs using Python // argument matching rules, and checks that the op matches a known schema @@ -1323,11 +1332,11 @@ inline void Node::cloneFrom(Node * s) { copyAttributes(*s); } -inline Block::Block(Graph * graph_, Node * node_) -: graph_(graph_) -, output_(initOutput(graph_->create(prim::Return, 0))) -, input_(graph_->create(prim::Param,0)) -, owning_node_(node_) { +inline Block::Block(Graph* graph_, Node* node_) + : graph_(graph_), + output_(initOutput(graph_->create(prim::Return, 0))), + input_(graph_->create(prim::Param, 0)), + owning_node_(node_) { graph_->all_blocks.emplace(this); output_->owning_block_ = this; input_->owning_block_ = this; diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 75e5833535bcfc..d701b536c44a0c 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -58,12 +58,19 @@ struct SchemaParser { {"float", FloatType::get() }, {"int", IntType::get() }, {"bool", IntType::get() }, // TODO: add separate bool type + {"World", WorldType::get() }, }; auto tok = L.expect(TK_IDENT); auto text = tok.text(); auto it = type_map.find(text); - if(it == type_map.end()) + if(it == type_map.end()) { + if(text.size() > 0 && islower(text[0])) { + // lower case identifiers that are not otherwise valid types + // are treated as type variables + return VarType::create(text); + } throw ErrorReport(tok.range) << "unknown type specifier"; + } return it->second; } void parseArgumentType(std::vector& arguments) { @@ -358,9 +365,16 @@ bool Operator::matches(const Node* node) const { if(actuals.size() < formals.size()) return false; + + TypeEnv type_env; for(size_t i = 0; i < formals.size(); ++i) { - // mismatched input type - if (!actuals[i]->type()->isSubtypeOf(formals[i].type)) { + try { + TypePtr formal = matchTypeVariables(formals[i].type, actuals[i]->type(), type_env); + // mismatched input type + if (!actuals[i]->type()->isSubtypeOf(formal)) { + return false; + } + } catch(TypeMatchError& err) { return false; } } diff --git a/torch/csrc/jit/passes/annotate_effects.cpp b/torch/csrc/jit/passes/annotate_effects.cpp new file mode 100644 index 00000000000000..b8aaec83dddbac --- /dev/null +++ b/torch/csrc/jit/passes/annotate_effects.cpp @@ -0,0 +1,320 @@ +#include "torch/csrc/jit/passes/annotate_effects.h" + +#include +#include "torch/csrc/jit/passes/dead_code_elimination.h" + +namespace torch { +namespace jit { +namespace { + +/** + * AnnotateEffects + * + * This pass annotates effectful operations (such as ones that mutate existing + * values) to prevent subsequent passes from re-ordering ops in a way that + * changes the meaning of the program. + * + * It does this by threading a "world token" value through nodes that use + * mutable values. This models effects explicitly in the IR and forces all + * annotated nodes to be linearized during optimization. 
+ * + * For mutating operators: the world token is threaded directly through the node + * For purely functional operators: their node will be "fenced" by two + * `prim::MemoryFence` nodes that take world tokens as their input. + * + * Graphs have special EntryWorld and ExitWorld nodes that provide end-points + * for the world token. They are similar to graph inputs/outputs in that they + * are not in the node list and only accessible via special methods. + * + * When inlined, graphs will manifest the EntryWorld/ExitWorld nodes explicitly + * as StoreWorld/LoadWorld ops so that they can act as endpoints where the + * callee "world thread" can be joined to the caller world thread. + */ +class AnnotateEffectsImpl { + public: + void annotateEffects(Graph* g) { + if (!shouldAnnotate(g->block())) { + return; + } + + // Generate the first world token + Value* curToken = nullptr; + { + WithInsertPoint guard(*g->nodes().begin()); + auto loadWorld = g->insertNode(g->create(prim::LoadWorld, 1)); + curToken = loadWorld->output()->setType(WorldType::get()); + } + + auto lastToken = visitBlock(g->block(), curToken); + + auto storeWorld = g->insertNode(g->create(prim::StoreWorld, 0)); + storeWorld->addInput(lastToken); + } + + private: + Value* visitBlock(Block* block, Value* curToken) { + for (auto* node : block->nodes()) { + curToken = visitNode(node, curToken); + } + return curToken; + } + + // General node annotation. If a node uses a mutable variable (or mutates a + // previously constant variable), annotate it + // + // Returns the last world token emitted for subsequent annotations to use. + Value* visitNode(Node* node, Value* curToken) { + // Avoid annotating memory fences. This avoids an infinite loop as we add + // fences and continue to iterate through nodes. + if (node->kind() == prim::MemoryFence) { + // Return this memory fence's world token + return node->outputs().at(0); + } + + // Handle inlined functions. Inlined functions will expose their Entry and + // Exit tokens as regular nodes. These exposed nodes provide fixed points + // to thread the current world token through. 
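+  // Concretely: the callee's LoadWorld output is replaced by the caller's current token,
+  // and the callee's StoreWorld input becomes the token handed back to the caller, so the
+  // single world thread continues unbroken across the inlining boundary.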
+ if (node->kind() == prim::LoadWorld) { + auto inlinedEntryToken = node->output(); + inlinedEntryToken->replaceAllUsesWith(curToken); + return curToken; + } + + if (node->kind() == prim::StoreWorld) { + return node->input(); + } + + if (node->kind() == prim::If) { + JIT_ASSERT(node->blocks().size() == 2); + + auto trueBlock = node->blocks().at(0); + auto falseBlock = node->blocks().at(1); + + auto trueToken = visitBlock(trueBlock, curToken); + auto falseToken = visitBlock(falseBlock, curToken); + + // If any branch has a mutating op, this node has to output a world token + if (trueToken != curToken || falseToken != curToken) { + trueBlock->registerOutput(trueToken); + falseBlock->registerOutput(falseToken); + + return node->addOutput()->setType(WorldType::get()); + } + return curToken; + } + + if (node->kind() == prim::Loop) { + JIT_ASSERT(node->blocks().size() == 1); + auto block = node->blocks().at(0); + if (!shouldAnnotate(block)) { + // Bail out early if there's no mutable variables used inside + return curToken; + } + + // Register the world token as a loop carried dependency + auto beginLoopToken = block->addInput()->setType(WorldType::get()); + auto endLoopToken = visitBlock(block, beginLoopToken); + block->registerOutput(endLoopToken); + + JIT_ASSERT(endLoopToken != beginLoopToken); + + // Thread the world token through the loop node + node->addInput(curToken); + return node->addOutput()->setType(WorldType::get()); + } + + // For mutating ops, just thread the world token through the node. + if (isMutatingOp(node)) { + // Replace the "dummy" token generated by the compiler + node->replaceInput(0, curToken); + return node->outputs().at(0); + } + + JIT_ASSERT(node->blocks().size() == 0); + + // For pure ops that need to be annotated, fence them. + if (shouldAnnotate(node)) { + if (isFenced(node)) { + // If the node has already been fenced, just return the value from the + // end fence. This can happen when another graph is inlined. + return getTokenForFencedNode(node); + } + return addFenceForNode(node, curToken); + } + + return curToken; + } + + bool shouldAnnotate(const Node* node) { + // Check if this node uses a known mutable value + for (const auto* input : node->inputs()) { + if (!isMutableType(input)) { + // TODO(suo): Right now, we only support mutable lists. + // If we remove this check, it's not clear whether: + // + // append(int[] a, int b) + // + // mutates `a` or `b`. We'll need to extend the schema language to be + // able to express which argument is mutated. + continue; + } + // First check the cache + if (mutableValues_.count(input) != 0) { + return true; + } + + // Check whether any mutating op uses this input + for (const auto& use : input->uses()) { + if (isMutatingOp(use.user)) { + mutableValues_.insert(input); + return true; + } + } + } + + // Check that any sub-blocks need to be annotated + for (auto block : node->blocks()) { + if (shouldAnnotate(block)) { + return true; + } + } + + return false; + } + + bool shouldAnnotate(const Block* block) { + return std::any_of( + block->nodes().begin(), block->nodes().end(), [this](const Node* node) { + return shouldAnnotate(node); + }); + } + + bool isMutableType(const Value* value) { + return value->type()->kind() == TypeKind::ListType; + } + + bool isMutatingOp(const Node* node) { + return !node->inputs().empty() && + node->inputs()[0]->type() == WorldType::get(); + } + + // Returns true iff this node has already been fenced. This can happen if + // another graph was inlined into the current one. 
+ bool isFenced(const Node* node) { + // A node is fenced if all its inputs/outputs are used by memory fences. + const auto inputsFenced = std::all_of( + node->inputs().begin(), node->inputs().end(), [&](const Value* input) { + return std::any_of( + input->uses().cbegin(), + input->uses().cend(), + [&](const Use& use) { + return use.user->kind() == prim::MemoryFence; + }); + }); + if (!inputsFenced) { + return false; + } + + const auto outputsFenced = std::all_of( + node->outputs().begin(), + node->outputs().end(), + [&](const Value* input) { + return std::any_of( + input->uses().cbegin(), + input->uses().cend(), + [&](const Use& use) { + return use.user->kind() == prim::MemoryFence; + }); + }); + if (!outputsFenced) { + return false; + } + + return true; + } + + // Given a fenced node, return the world token outputted from its end fence + Value* getTokenForFencedNode(const Node* node) { + // Take advantage of the fact that the end fence consumes the node's + // outputs, i.e. it will be the only user. + const auto output = node->outputs().at(0); + JIT_ASSERT(output->uses().size() == 1); + const auto endFence = output->uses()[0].user; + const auto token = endFence->outputs().at(0); + JIT_ASSERT(token->type() == WorldType::get()); + return token; + } + + // Create a memory fence around a node, using the world token. + // + // Input: + // %size : Int = prim::len(%mut_list) + // + // Output: + // %t.1 : World, %list.2 : int[] = prim::MemoryFence(%curToken, %mut_list) + // %size : Int = prim::len(%mut_list) + // %t.2 : World, %size.2 : int = prim::MemoryFence(%t.1, %size) + // + // Returns the new world token (%t.2) for subsequent fences to use. + Value* addFenceForNode(Node* node, Value* curToken) { + // Add a start fence + auto startFence = + node->owningGraph()->create(prim::MemoryFence, /*outputs=*/0); + + // Add world tokens as the first input and output + startFence->addInput(curToken); + curToken = startFence->addOutput()->setType(WorldType::get()); + + // Fence off all node's inputs + for (const auto input : node->inputs()) { + startFence->addInput(input); + startFence->addOutput()->setType(input->type()); + } + + startFence->insertBefore(node); + + JIT_ASSERT(node->inputs().size() == startFence->outputs().size() - 1); + + // modify the node to take in the start fence's output values + for (size_t i = 0; i < node->inputs().size(); i++) { + node->replaceInput(i, startFence->outputs()[i + 1]); + } + + // Add an end fence + auto endFence = + node->owningGraph()->create(prim::MemoryFence, /*outputs=*/0); + + // Add world tokens as the first input and output + endFence->addInput(curToken); + curToken = endFence->addOutput()->setType(WorldType::get()); + + // Fence off all the node's outputs + for (auto output : node->outputs()) { + endFence->addInput(output); + auto fencedOutput = endFence->addOutput()->setType(output->type()); + output->replaceAllUsesWith(fencedOutput); + // replaceAllUsesWith() replaces the fence's INPUT value with the new + // output as well, so we need to manually add the "real" input back + endFence->replaceInputWith(fencedOutput, output); + } + + endFence->insertAfter(node); + + return curToken; + } + + // Memoize which values will be mutated at some point in the program + std::set mutableValues_; +}; +} // namespace + +void AnnotateEffects(std::shared_ptr& graph) { + AnnotateEffectsImpl impl; + impl.annotateEffects(graph.get()); + + // Prune the dummy world tokens + EliminateDeadCode(graph); +} + +} // namespace jit +} // namespace torch diff --git 
a/torch/csrc/jit/passes/annotate_effects.h b/torch/csrc/jit/passes/annotate_effects.h new file mode 100644 index 00000000000000..9c8e969d54ba41 --- /dev/null +++ b/torch/csrc/jit/passes/annotate_effects.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { +namespace jit { + +TORCH_API void AnnotateEffects(std::shared_ptr& graph); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index b9d36d0e4b88e3..179f3751526c4e 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -17,6 +17,8 @@ std::unordered_set skip_list = { prim::Loop, //TODO: handle Loop prim::Print, prim::PythonOp, //may have side effects + prim::LoadWorld, + prim::StoreWorld, //all the rand functions from native_functions.yaml aten::rand, aten::rand_like, diff --git a/torch/csrc/jit/passes/dead_code_elimination.cpp b/torch/csrc/jit/passes/dead_code_elimination.cpp index d8341cbb99c6aa..6424eb70a6cafc 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.cpp +++ b/torch/csrc/jit/passes/dead_code_elimination.cpp @@ -13,12 +13,13 @@ bool hasSideEffects(Node * node, bool_memo_type& memo) { auto it = memo.find(node); if (it != memo.end()) return it->second; - bool has_side_effects = node->kind() == prim::Print || - std::any_of(node->blocks().begin(), node->blocks().end(), - [&](Block *b) { - return std::any_of(b->nodes().begin(), b->nodes().end(), - [&](Node *n) { return hasSideEffects(n, memo); }); - }); + bool has_side_effects = + node->kind() == prim::Print || node->kind() == prim::StoreWorld || + std::any_of(node->blocks().begin(), node->blocks().end(), [&](Block* b) { + return std::any_of(b->nodes().begin(), b->nodes().end(), [&](Node* n) { + return hasSideEffects(n, memo); + }); + }); memo.emplace(node, has_side_effects); return has_side_effects; } diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index 6c7166f3b43552..c8a1ef566f2a3c 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -208,16 +208,12 @@ struct GraphFuser { /*const=*/attr::alpha) || node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*const=*/{attr::other, attr::alpha}) || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", /*const=*/attr::alpha) || node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*const=*/{attr::other, attr::alpha}) || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::mul(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::div(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::div(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::clamp(Tensor self, Scalar min, Scalar max) -> Tensor", /*const=*/{attr::min, attr::max})) { auto inputs = tensorInputs(node); return haveSupportedType(inputs); @@ -225,22 +221,16 @@ struct GraphFuser { else if ( node->matches("aten::lt(Tensor self, Tensor other) -> Tensor") || node->matches("aten::lt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::lt(Scalar other, 
Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::le(Tensor self, Tensor other) -> Tensor") || node->matches("aten::le(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::le(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::gt(Tensor self, Tensor other) -> Tensor") || node->matches("aten::gt(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::gt(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ge(Tensor self, Tensor other) -> Tensor") || node->matches("aten::ge(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::ge(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::eq(Tensor self, Tensor other) -> Tensor") || node->matches("aten::eq(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::eq(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other) || node->matches("aten::ne(Tensor self, Tensor other) -> Tensor") || - node->matches("aten::ne(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other) || - node->matches("aten::ne(Scalar other, Tensor self) -> Tensor", /*const=*/attr::other)) { + node->matches("aten::ne(Tensor self, Scalar other) -> Tensor", /*const=*/attr::other)) { // comparison operators produce Byte type, and it's ok, check only inputs auto inputs = tensorInputs(node); return haveSupportedType(inputs); diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index 8045a46a4af1ba..6e90780ecbf695 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -159,7 +159,7 @@ void eliminateNopTranspose(Block *b) { } if (n->kind() == onnx::Transpose) { if (isNopTranspose(n->is(attr::perm))) { - n->replaceAllUsesWith(n->input()->node()); + n->output()->replaceAllUsesWith(n->input()); it.destroyCurrent(); continue; } diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index b01d9d3b61359c..eedc7fd0a8a686 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -557,10 +557,6 @@ bool PropagateTensorShapeOnNode(Node * node, bool insert_expands) { "aten::pow(Tensor self, Scalar exponent) -> Tensor", "aten::fmod(Tensor self, Scalar other) -> Tensor", "aten::remainder(Tensor self, Scalar other) -> Tensor", - "aten::add(Scalar other, Tensor self) -> Tensor", - "aten::sub(Scalar other, Tensor self) -> Tensor", - "aten::mul(Scalar other, Tensor self) -> Tensor", - "aten::div(Scalar other, Tensor self) -> Tensor", "aten::pow(Scalar base, Tensor self) -> Tensor", "aten::__and__(Tensor self, Scalar other) -> Tensor", "aten::__or__(Tensor self, Scalar other) -> Tensor", @@ -1139,10 +1135,7 @@ bool PropagateCompleteShapeOnNode(Node * node, bool insert_expands, } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor") || node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor") || node->matches("aten::mul(Tensor self, Scalar other) -> Tensor") || - node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor") || - node->matches("aten::add(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::sub(Scalar other, Tensor self) -> Tensor") || - node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { + node->matches("aten::pow(Tensor self, Scalar exponent) -> Tensor")) { 
node->output()->setType(tensor_types.at(0)); return true; } else if (insert_expands && ( diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index f78da9b92baccc..0d56ca2255286f 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -525,11 +525,10 @@ void ToBatch::toBatch(Block* block, Block* res_block) { } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ - // std::cout<toString()< res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); - // std::cout<toString()< +#include #include #include @@ -86,7 +86,7 @@ inline IValue createGenericList(py::handle obj, const TypePtr& elem_type) { for(auto elem : obj) { elems.push_back(toIValue(elem, elem_type)); } - return ConstantList::create(std::move(elems)); + return List::create(std::move(elems)); } inline IValue toIValue(py::handle obj, const TypePtr& type) { @@ -140,8 +140,11 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { return createGenericList(obj, elem_type); } } + case TypeKind::WorldType: + AT_ERROR("World arguments should not be passed in by users"); case TypeKind::NumberType: case TypeKind::GeneratorType: + case TypeKind::VarType: break; } AT_ERROR("Missing cases in toIValue for type: ", type->str(), "! File a bug report."); @@ -199,6 +202,14 @@ inline py::object toPyObject(IValue&& ivalue) { return py::cast(ivalue.toDoubleListRef()); } else if (ivalue.isTensorList()) { return py::cast(ivalue.toTensorListRef()); + } else if (ivalue.isGenericList()) { + auto list = ivalue.toGenericList(); + const auto & elements = list->elements(); + py::list t { elements.size() }; + for (size_t i = 0; i < elements.size(); ++i) { + t[i] = toPyObject(IValue{elements[i]}); + } + return t; } else if (ivalue.isTuple()) { auto tuple = ivalue.toTuple(); const auto & elements = tuple->elements(); diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 5aa053f626faa1..ad03ac556cd272 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -455,6 +455,10 @@ void initPythonIRBindings(PyObject * module_) { return "StringType"; case TypeKind::GeneratorType: return "GeneratorType"; + case TypeKind::VarType: + return "VarType"; + case TypeKind::WorldType: + return "WorldType"; } // not reachable, but some compilers complain AT_ERROR("Unknown Type Kind"); diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 71168cd3ee3d4d..cdea4ab894b253 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -52,6 +52,13 @@ void checkImplicitTensorToNum(at::Tensor t, bool toInt) { } RegisterOperators reg({ + Operator( + prim::MemoryFence, + [](Node* node) { + return [](Stack& stack) { + return 0; + }; + }), Operator( prim::FusionGroup, [](Node* node) { @@ -204,6 +211,30 @@ RegisterOperators reg({ return 0; }; }), + Operator( + prim::LoadWorld, + [](Node* node) { + return [](Stack& stack) { + push(stack, World{0}); + return 0; + }; + }), + Operator( + prim::StoreWorld, + [](Node* node) { + return [](Stack& stack) { + drop(stack, 1); + return 0; + }; + }), + Operator( + prim::DummyWorld, + [](Node* node) { + return [](Stack& stack) { + AT_ERROR("Encountered a dummy world during graph execution."); + return 0; + }; + }), Operator( onnx::Reshape, [](Node* node) { @@ -399,9 +430,17 @@ RegisterOperators reg({ return 0; }; } else { - std::stringstream ss; - ss << "unsupported list type: " << 
*lt->getElementType(); - throw std::runtime_error(ss.str()); + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i])); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } }), }); @@ -441,26 +480,6 @@ RegisterOperators reg({ #define DEFINE_BINARY_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, float) #define DEFINE_COMPARISON_OP(aten_op, op) DEFINE_GENERIC_OP(aten_op, op, op, int) -// define helpers for where aten is missing scalar overloads -// note: it would be better to define these in a standard library as -// script functions and have the compiler substitute them in -// however, we need to add type annotations to the parser in order for us -// to move them there. -// e.g. s + t ==> t + s -// e.g. s - d == -d + s - -#define DEFINE_ST_OP(aten_op, reverse_exp) \ - Operator("aten::" #aten_op "(Scalar other, Tensor self) -> Tensor", [](Node* node) { \ - return [=](Stack& stack) { \ - at::Scalar a; \ - at::Tensor b; \ - pop(stack, a, b); \ - at::DeviceGuard guard(b); \ - push(stack, reverse_exp); \ - return 0; \ - }; \ - }), - // Convert an python index (which may be negative) into an index usable for a // C++ container int64_t normalizeIndex(int64_t idx, int64_t list_size) { @@ -471,6 +490,19 @@ int64_t normalizeIndex(int64_t idx, int64_t list_size) { return idx; } +template +Operation listAppend(Node* node) { + return [](Stack& stack) { + TList a; + TElement el; + pop(stack, a, el); + + a->elements().push_back(el); + + return 0; + }; +} + template Operation listSelect(Node* node) { return [=](Stack& stack) { @@ -506,11 +538,7 @@ Operation listEq(Node* node) { T a; T b; pop(stack, a, b); - if (a->elements() == b->elements()) { - push(stack, 1); - } else { - push(stack, 0); - } + push(stack, a->elements() == b->elements() ? 
1 : 0); return 0; }; } @@ -604,32 +632,29 @@ Operation listSlice(Node* node) { } RegisterOperators reg2({ - Operator("aten::select(int[] a, int b) -> int", listSelect>), - Operator("aten::select(float[] a, int b) -> float", listSelect>), - Operator("aten::select(Tensor[] a, int b) -> Tensor", listSelect>), - Operator("aten::len(int[] a) -> int", listLen>), - Operator("aten::len(float[] a) -> int", listLen>), - Operator("aten::len(Tensor[] a) -> int", listLen>), +#define CREATE_LIST_OPS(decl_type, c_type) \ + Operator("aten::select(" decl_type "[] a, int b) -> " decl_type, listSelect>), \ + Operator("aten::len(" decl_type "[] a) -> int", listLen>), \ + Operator("aten::add(" decl_type "[] a, " decl_type "[] b) -> " decl_type "[]", listAdd, c_type::ElemType>), \ + Operator( \ + "aten::slice(" decl_type "[] l, int start, int end=9223372036854775807, int step=1) -> " decl_type "[]", \ + listSlice, c_type::ElemType>), \ + Operator( \ + "aten::append(World w, " decl_type "[] list, " decl_type " el) -> World", \ + listAppend, c_type::ElemType>), \ + + + CREATE_LIST_OPS("int", IntList) + CREATE_LIST_OPS("float", DoubleList) + CREATE_LIST_OPS("Tensor", TensorList) + CREATE_LIST_OPS("t", GenericList) + Operator("aten::eq(int[] a, int[] b) -> int", listEq>), Operator("aten::eq(float[] a, float[] b) -> int", listEq>), Operator("aten::eq(Tensor[] a, Tensor[] b) -> int", listEq>), - Operator("aten::add(int[] a, int[] b) -> int[]", listAdd, int64_t>), - Operator("aten::add(float[] a, float[] b) -> float[]", listAdd, double>), - Operator("aten::add(Tensor[] a, Tensor[] b) -> Tensor[]", listAdd, at::Tensor>), - - Operator( - "aten::slice(int[] l, int start, int end=9223372036854775807, int step=1) -> int[]", - listSlice, int64_t>), - Operator( - "aten::slice(float[] l, int start, int end=9223372036854775807, int step=1) -> float[]", - listSlice, double>), - Operator( - "aten::slice(Tensor[] l, int start, int end=9223372036854775807, int step=1) -> Tensor[]", - listSlice, at::Tensor>), - DEFINE_BINARY_OP(aten::add, a + b) DEFINE_BINARY_OP(aten::sub, a - b) DEFINE_BINARY_OP(aten::mul, a * b) @@ -748,21 +773,5 @@ RegisterOperators reg2({ return 0; }; }), - // commutative - DEFINE_ST_OP(mul, at::mul(b, a)) - DEFINE_ST_OP(add, at::add(b, a)) - DEFINE_ST_OP(ne, at::ne(b, a)) - DEFINE_ST_OP(eq, at::eq(b, a)) - - // comparisons, reverse the condition - DEFINE_ST_OP(lt, b > a) - DEFINE_ST_OP(le, b >= a) - DEFINE_ST_OP(gt, b < a) - DEFINE_ST_OP(ge, b <= a) - - // rsub - DEFINE_ST_OP(sub, at::add(b.neg(), a)) - // rdiv - DEFINE_ST_OP(div, at::mul(at::reciprocal(b), a)) }); }}} // torch::jit::anon diff --git a/torch/csrc/jit/script/builtin_functions.cpp b/torch/csrc/jit/script/builtin_functions.cpp new file mode 100644 index 00000000000000..ea82d06879d7c7 --- /dev/null +++ b/torch/csrc/jit/script/builtin_functions.cpp @@ -0,0 +1,83 @@ +#include "torch/csrc/jit/script/builtin_functions.h" +#include "torch/csrc/api/include/torch/jit.h" +#include "torch/csrc/jit/code_template.h" + +namespace torch { namespace jit { namespace script { + +auto scalar_operators_source = CodeTemplate( +R"SCRIPT( +def mul(a : ${Scalar}, b : Tensor) -> Tensor: + return b * a +def add(a : ${Scalar}, b : Tensor) -> Tensor: + return b + a +def ne(a : ${Scalar}, b : Tensor) -> Tensor: + return b != a +def eq(a : ${Scalar}, b : Tensor) -> Tensor: + return b == a +def lt(a : ${Scalar}, b : Tensor) -> Tensor: + return b > a +def le(a : ${Scalar}, b : Tensor) -> Tensor: + return b >= a +def gt(a : ${Scalar}, b : Tensor) -> Tensor: + return b < a +def 
ge(a : ${Scalar}, b : Tensor) -> Tensor: + return b <= a +def sub(a : ${Scalar}, b : Tensor) -> Tensor: + return torch.neg(b) + a +def div(a : ${Scalar}, b : Tensor) -> Tensor: + return torch.reciprocal(b) * a +)SCRIPT"); + +struct BuiltinFunctionRegistry { + + const std::vector<Method*>& getAllBuiltinFunctionsFor(Symbol name) { + const static std::vector<Method*> empty; + // when initializing the builtin function library, we will re-enter + // getAllBuiltinFunctionsFor since it is called in the compiler to + // look up builtins, and initializing the builtin functions calls the compiler. + // To avoid deadlocking, we use a recursive mutex (the same thread can re-lock + // the mutex without waiting), and report no loaded builtins during init. + std::lock_guard<std::recursive_mutex> guard(mutex); + if(state == INITIALIZING) { + return empty; + } else if (state == UNINITIALIZED) { + state = INITIALIZING; + loadBuiltinFunctions(); + state = INITIALIZED; + } + JIT_ASSERT(state == INITIALIZED); + auto it = builtins_by_name.find(name); + if(it == builtins_by_name.end()) + return empty; + return it->second; + } +private: + void loadSource(const std::string& source) { + auto module = std::make_shared<Module>(); + defineMethodsInModule( + *module, source, script::nativeResolver, /*self=*/nullptr); + modules.push_back(module); + for (auto& method : module->get_methods()) { + builtins_by_name[Symbol::fromQualString("aten::" + method.key)].push_back( + method.value.get()); + } + } + void loadBuiltinFunctions() { + for(auto scalar : {"float", "int"}) { + TemplateEnv env; + env.s("Scalar", scalar); + loadSource(scalar_operators_source.format(env)); + } + } + enum {UNINITIALIZED, INITIALIZING, INITIALIZED} state = UNINITIALIZED; + std::recursive_mutex mutex; + std::vector<std::shared_ptr<Module>> modules; + std::unordered_map<Symbol, std::vector<Method*>> builtins_by_name; +}; + +TORCH_API const std::vector<Method*>& getAllBuiltinFunctionsFor(Symbol name) { + static BuiltinFunctionRegistry registry; + return registry.getAllBuiltinFunctionsFor(name); +} + +}}} diff --git a/torch/csrc/jit/script/builtin_functions.h b/torch/csrc/jit/script/builtin_functions.h new file mode 100644 index 00000000000000..042dc96b1826f0 --- /dev/null +++ b/torch/csrc/jit/script/builtin_functions.h @@ -0,0 +1,13 @@ +#pragma once + +#include "torch/csrc/WindowsTorchApiMacro.h" +#include "torch/csrc/jit/script/module.h" + +namespace torch { namespace jit { namespace script { + + +TORCH_API const std::vector<Method*>& getAllBuiltinFunctionsFor(Symbol name); + + + +}}} diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index b66b96dd5eb6fb..28aa735fc37249 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1,5 +1,6 @@ #include "torch/csrc/jit/script/compiler.h" #include "torch/csrc/jit/passes/lower_tuples.h" +#include "torch/csrc/jit/passes/annotate_effects.h" #include "torch/csrc/jit/operator.h" #include "torch/csrc/jit/interpreter.h" #include "torch/csrc/jit/ir.h" @@ -7,6 +8,7 @@ #include "torch/csrc/jit/assertions.h" #include "torch/csrc/utils/object_ptr.h" #include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/script/builtin_functions.h" #include "torch/csrc/jit/constants.h" @@ -449,7 +451,8 @@ Value* tryMatchArgument( const SourceRange& loc, const NamedValue& named_value, std::function<std::ostream&()> err, - bool convert_tensors_to_nums) { + bool convert_tensors_to_nums, + TypeEnv & type_env) { Value* value = named_value.value(graph); // some functions that take lists of integers for fixed size arrays @@ -460,35 +463,44 @@ Value* tryMatchArgument( value =
graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } + TypePtr concrete_type; + try { + concrete_type = matchTypeVariables(arg.type, value->type(), type_env); + } catch(TypeMatchError& e) { + err() << "could not match type " << value->type()->str() << " to " + << arg.type->str() << " in argument '" << arg.name << "': " << e.what() << "\n" + << named_value.locOr(loc); + return nullptr; + } + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types - if (convertibleToList(value->type(), arg.type) && + if (convertibleToList(value->type(), concrete_type) && value->type()->kind() == TypeKind::TupleType) { auto unpacked = createTupleUnpack(value); - auto elem_type = arg.type->expect()->getElementType(); + auto elem_type = concrete_type->expect()->getElementType(); value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } if (value->node()->kind() == prim::None){ - if (arg.type->isSubtypeOf(NumberType::get())) + if (concrete_type->isSubtypeOf(NumberType::get())) value = graph.insertConstant(at::Scalar(NAN), loc); - else if (arg.type->isSubtypeOf(GeneratorType::get())) { - value = graph.insertNode(graph.createNoneGenerator()) - ->output()->setType(GeneratorType::get()); + else if (concrete_type->isSubtypeOf(GeneratorType::get())) { + value = graph.insertNode(graph.createNoneGenerator())->output(); } else value = graph.insertNode(graph.createUndefined())->output(); } //implicit conversion of tensors to scalars - if(convert_tensors_to_nums && arg.type->isSubtypeOf(NumberType::get()) + if(convert_tensors_to_nums && concrete_type->isSubtypeOf(NumberType::get()) && value->type()->isSubtypeOf(DynamicType::get())) { - auto n = graph.createImplicitTensorToNum(arg.type, value); + auto n = graph.createImplicitTensorToNum(concrete_type, value); value = graph.insertNode(n) ->setSourceLocation(std::make_shared(loc)) ->output(); } - if(!value->type()->isSubtypeOf(arg.type)) { - err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " + if(!value->type()->isSubtypeOf(concrete_type)) { + err() << "expected a value of type " << concrete_type->str() << " for argument '" << arg.name << "' but found " << value->type()->str() << "\n" << named_value.locOr(loc); return nullptr; @@ -510,11 +522,12 @@ Value* tryCreateList( const SourceRange& loc, at::ArrayRef varargs, std::function err, - bool convert_tensor_to_num) { - Argument elem_arg("", elem_type); + bool convert_tensor_to_num, + TypeEnv & type_env) { + Argument elem_arg("", elem_type); std::vector list_ctor; for(const auto& a : varargs) { - Value* av = tryMatchArgument(elem_arg, graph, loc, a, err, convert_tensor_to_num); + Value* av = tryMatchArgument(elem_arg, graph, loc, a, err, convert_tensor_to_num, type_env); if(!av) return nullptr; list_ctor.push_back(av); @@ -537,117 +550,129 @@ static Value* materializeConstant(T val, Graph& graph, return new_constant; } -at::optional> tryMatchSchema( - const FunctionSchema& schema, - const SourceRange& loc, - Graph& graph, - at::ArrayRef args, - at::ArrayRef kwargs, - std::ostream& failure_messages, - bool convert_tensors_to_nums) { - auto err = [&]() -> std::ostream& { - failure_messages << "\nfor operator " << schema << ":\n"; - return failure_messages; - }; - - std::vector positional_inputs; - std::vector used_kwarg(kwargs.size(), false); - - // if we finish the loop will we have consumed all arguments? 
- size_t used_args = 0; - - for(size_t schema_i = 0; schema_i < schema.arguments.size(); ++schema_i) { - const auto& arg = schema.arguments[schema_i]; - at::optional v; - if(!arg.kwarg_only && schema_i < args.size()) { +at::optional tryMatchSchema( + const FunctionSchema& schema, + const SourceRange& loc, + Graph& graph, + at::ArrayRef raw_args, + at::ArrayRef kwargs, + std::ostream& failure_messages, + bool convert_tensors_to_nums) { + // Match against a potentially mutable schema. + // + // We need to treat mutable schemas differently because the IR explicitly + // expresses effects by including a world token in mutable ops. Users do not + // know about the world token, so we need to generate a dummy one and add + // it to the inputs for schema matching. + // + // Example: + // append(int[] list, int el) + // becomes + // append(World w, int[] list, int el) + // + // NOTE: The dummy world token has no meaning; the AnnotateEffects pass is + // necessary to enforce linearization on effectful ops. + std::vector modifiedArgs(raw_args.begin(), raw_args.end()); + if (schema.is_mutable) { + // Add a dummy world token to be matched against + const auto worldToken = graph.insertDummyWorld(); + modifiedArgs.insert(modifiedArgs.begin(), worldToken); + } + auto err = [&]() -> std::ostream& { + failure_messages << "\nfor operator " << schema << ":\n"; + return failure_messages; + }; - // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) - if (arg.type->kind() == TypeKind::ListType && // the formal must be a list - !arg.N && // it must not be a broadcasting list like int[3], otherwise a single int is a valid input - (schema_i + 1 == schema.arguments.size() || schema.arguments[schema_i + 1].kwarg_only) && // must be the last position argument - !convertibleToList(args[schema_i].value(graph)->type(), arg.type)) { // and the actual should not be a list already + TypeEnv type_env; + std::vector positional_inputs; + std::vector used_kwarg(kwargs.size(), false); + + // if we finish the loop will we have consumed all arguments? 
+ size_t used_args = 0; + + for (size_t schema_i = 0; schema_i < schema.arguments.size(); ++schema_i) { + const auto& arg = schema.arguments[schema_i]; + at::optional v; + if (!arg.kwarg_only && schema_i < modifiedArgs.size()) { + // allow zeros(IntList sizes) to work with zeros(1, 2) or zeros(1) + if (arg.type->kind() == TypeKind::ListType && // the formal must be a list + !arg.N && // it must not be a broadcasting list like int[3], otherwise + // a single int is a valid input + (schema_i + 1 == schema.arguments.size() || + schema.arguments[schema_i + 1] + .kwarg_only)) { // must be the last position argument + auto actual_type = modifiedArgs[schema_i].value(graph)->type(); + if (actual_type->kind() != TypeKind::ListType && + !convertibleToList( + actual_type, + arg.type)) { // and the actual should not be a list already auto elem_type = arg.type->expect()->getElementType(); - Value* list = tryCreateList(elem_type, graph, loc, args.slice(schema_i), - err, convert_tensors_to_nums); - if(!list) + Value* list = tryCreateList( + elem_type, + graph, + loc, + at::ArrayRef(modifiedArgs).slice(schema_i), + err, + convert_tensors_to_nums, + type_env); + if (!list) return at::nullopt; - used_args = args.size(); + used_args = modifiedArgs.size(); positional_inputs.push_back(list); continue; } + } - v = args[schema_i]; - used_args++; - } else if(auto idx = findInputWithName(arg.name, kwargs)) { - const NamedValue& nv = kwargs[*idx]; - if(used_kwarg[*idx]) { - err() << "argument " << nv.name() << " specified twice in schema, submit a bug report!\n" << nv.locOr(loc); - return at::nullopt; - } - used_kwarg[*idx] = true; - v = nv; - } else if(arg.default_value) { - v = NamedValue(*arg.default_value); - } else { - err() << "argument " << schema.arguments[schema_i].name << " not provided.\n" << loc; + v = modifiedArgs[schema_i]; + used_args++; + } else if (auto idx = findInputWithName(arg.name, kwargs)) { + const NamedValue& nv = kwargs[*idx]; + if (used_kwarg[*idx]) { + err() << "argument " << nv.name() + << " specified twice in schema, submit a bug report!\n" + << nv.locOr(loc); return at::nullopt; } - Value * positional = tryMatchArgument(arg, graph, loc, *v, err, convert_tensors_to_nums); - if(!positional) - return at::nullopt; - positional_inputs.push_back(positional); - } - - // check for unused positional arguments - if(used_args < args.size()) { - err() << "expected at most " << used_args << " arguments " - << "but found " << args.size() << " positional arguments.\n" << loc << "\n"; + used_kwarg[*idx] = true; + v = nv; + } else if (arg.default_value) { + v = NamedValue(*arg.default_value); + } else { + err() << "argument " << schema.arguments[schema_i].name + << " not provided.\n" + << loc; return at::nullopt; } - // check for unused kwargs - for(size_t i = 0; i < kwargs.size(); ++i) { - const auto& nv = kwargs[i]; - if (!used_kwarg[i]) { - if(!schema.argumentIndexWithName(nv.name())) { - err() << "keyword argument " << nv.name() << " unknown\n"; - } else { - err() << "keyword argument " << nv.name() << " specified twice\n"; - } - return at::nullopt; + Value* positional = tryMatchArgument( + arg, graph, loc, *v, err, convert_tensors_to_nums, type_env); + if (!positional) + return at::nullopt; + positional_inputs.push_back(positional); + } + + // check for unused positional arguments + if (used_args < modifiedArgs.size()) { + err() << "expected at most " << used_args << " arguments " + << "but found " << modifiedArgs.size() << " positional arguments.\n" + << loc << "\n"; + return at::nullopt; + } + 
// check for unused kwargs + for (size_t i = 0; i < kwargs.size(); ++i) { + const auto& nv = kwargs[i]; + if (!used_kwarg[i]) { + if (!schema.argumentIndexWithName(nv.name())) { + err() << "keyword argument " << nv.name() << " unknown\n"; + } else { + err() << "keyword argument " << nv.name() << " specified twice\n"; } + return at::nullopt; } - return positional_inputs; -} - - -static Value* tryEmitBuiltin( - const std::shared_ptr& op, - std::stringstream& failure_messages, - const SourceRange& loc, - Graph& graph, - Symbol name, - at::ArrayRef inputs, - at::ArrayRef attributes, - bool convert_tensors_to_nums) { - - auto matched_inputs = tryMatchSchema(op->schema(), loc, graph, inputs, attributes, - failure_messages, convert_tensors_to_nums); - if(!matched_inputs) - return nullptr; - // we successfully matched this schema, construct the node - - auto n = graph.insertNode(graph.create(name, *matched_inputs, 0)) - ->setSourceLocation(std::make_shared(loc)); - - for(auto & ret : op->schema().returns) { - n->addOutput()->setType(ret.type); } - - // assert that we did indeed create an op that has implementation - // otherwise schema and dispatch are not in sync - getOperation(n); - - return packOutputs(graph, n->outputs()); + auto return_types = fmap(schema.returns, [&](const Argument& r) { + return evalTypeVariables(r.type, type_env); + }); + return MatchedSchema{std::move(positional_inputs), std::move(return_types)}; } static std::string prefixLine(const std::string& str, std::string prefix) { @@ -662,6 +687,29 @@ static std::string prefixLine(const std::string& str, std::string prefix) { return ss.str(); } +// Given a successful match between operator schema and symbol, emit a node +// with the appropriate inputs and outputs. +static Value* emitBuiltinNode( + const MatchedSchema& matched_schema, + const SourceRange& loc, + Graph& graph, + Symbol name) { + auto n = graph.insertNode(graph.create(name, matched_schema.inputs, 0)) + ->setSourceLocation(std::make_shared(loc)); + + for(auto & ret : matched_schema.return_types) { + n->addOutput()->setType(ret); + } + + // assert that we did indeed create an op that has implementation + // otherwise schema and dispatch are not in sync + getOperation(n); + + return packOutputs(graph, n->outputs()); +} + +// Search for operators matching the provided symbol name and input types. +// If one is found, emit a node to the graph for that operator. 
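+// Registered operators for the symbol are tried first; if none of their
+// schemas match, the builtin script functions from builtin_functions.cpp
+// (e.g. the reversed Scalar-Tensor overloads) are tried via try_emit_call_to.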
Value* emitBuiltinCall( const SourceRange& loc, Graph& graph, @@ -674,23 +722,45 @@ Value* emitBuiltinCall( const auto& variants = getAllOperatorsFor(name); + const auto& builtin_functions = getAllBuiltinFunctionsFor(name); + std::stringstream failure_messages; //first we try to match the schema without any conversion //if no schema matches then insert ImplicitTensorToNum - for(bool convert_tensors_to_nums : {false, true}) { - //clear previous error messages + for (bool convert_tensors_to_nums : {false, true}) { + // clear previous error messages failure_messages.str(""); for (const std::shared_ptr& op : variants) { - if (auto result = tryEmitBuiltin( - op, failure_messages, loc, graph, name, inputs, attributes, + const auto matched_schema = tryMatchSchema( + op->schema(), + loc, + graph, + inputs, + attributes, + failure_messages, + convert_tensors_to_nums); + + if (matched_schema) { + return emitBuiltinNode(*matched_schema, loc, graph, name); + } + } + for (Method* method : builtin_functions) { + if (auto result = try_emit_call_to( + graph, + loc, + *method, + inputs, + attributes, + failure_messages, + nullptr, convert_tensors_to_nums)) { - return result; + return packOutputs(graph, *result); } } } // none of the options worked - if(!required) { + if (!required) { return nullptr; } if(variants.size() == 0) { @@ -719,8 +789,8 @@ std::shared_ptr BuiltinFunction::call( if (value) inputs.push_back(*value); inputs.insert(inputs.end(), inputs_.begin(), inputs_.end()); - return std::make_shared( - emitBuiltinCall(loc, *m.graph(), symbol, inputs, attributes, true)); + return std::make_shared(emitBuiltinCall( + loc, *m.graph(), symbol, inputs, attributes, true)); } inline bool isSupportedListElementType(TypePtr type) { @@ -728,19 +798,6 @@ inline bool isSupportedListElementType(TypePtr type) { type->isSubtypeOf(NumberType::get()); } -// guard for List types we do not currently have operations for -inline void ensureLegalType(const SourceRange& range, TypePtr ptr) { - if(TupleTypePtr tt = ptr->cast()) { - for(auto elem : tt->elements()) { - ensureLegalType(range, elem); - } - } else if(ListTypePtr lt = ptr->cast()) { - if(!isSupportedListElementType(lt->getElementType())) { - throw ErrorReport(range) << "Lists can only contain numbers or Tensors, but found " << lt->getElementType()->str(); - } - } -} - struct to_ir { to_ir( Def def, @@ -791,7 +848,6 @@ struct to_ir { // Record the type for the schema and set the Type on the Value* arguments.push_back(schema.arguments.at(arg_annotation_idx++)); new_input->setType(arguments.back().type); - ensureLegalType((*it).ident().range(), arguments.back().type); } // body auto stmts = def.statements(); @@ -841,6 +897,8 @@ struct to_ir { } method.setSchema({def.name().name(), std::move(arguments), std::move(returns)}); + // annotate effects to prevent reordering + AnnotateEffects(graph); // remove any uses of tuples that we inserted that are not needed LowerSimpleTuples(graph); } @@ -1577,7 +1635,6 @@ struct to_ir { } Value* result = graph->insertNode(graph->createList(elem_type, values)) ->output(); - ensureLegalType(tree->range(), result->type()); return result; } break; case TK_TUPLE_LITERAL: { diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index deef6a5c2ca8f5..745137d1a9ad05 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -164,7 +164,13 @@ TORCH_API void ensureTensors(const SourceRange& range, at::ArrayRef valu // if it returns nullopt, then failure_messages contains a good error 
report // set convert_tensor_to_num to true if ImplicitTensorToNums should be inserted to // match the schema -TORCH_API at::optional> tryMatchSchema( + +struct MatchedSchema { + std::vector inputs; + std::vector return_types; +}; + +TORCH_API at::optional tryMatchSchema( const FunctionSchema& schema, const SourceRange& loc, Graph& graph, diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index f0dfda81cc0926..4c7df820b13b4f 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -114,9 +114,9 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { auto schema = getSchema(inputs.size(), n_binders); std::stringstream failure_messages; - at::optional> all_inputs = + at::optional matched_schema = tryMatchSchema(schema, loc, *m.graph(), inputs_, attributes, failure_messages, /*conv_tensor_to_num*/true); - if (!all_inputs) + if (!matched_schema) throw ErrorReport(loc) << failure_messages.str(); // Release the function object so we can wrap it in a PythonOp @@ -125,12 +125,12 @@ struct VISIBILITY_HIDDEN PythonValue : public SugaredValue { Node* new_node = m.graph()->insertNode(m.graph()->createPythonOp( THPObjectPtr(func.release().ptr()), cconv, {})); new_node->setSourceLocation(std::make_shared(loc)); - for(auto &i : *all_inputs) + for(auto &i : matched_schema->inputs) new_node->addInput(i); std::vector outputs; - for(auto & ret_arg : schema.returns) { - outputs.push_back(new_node->addOutput()->setType(ret_arg.type)); + for(auto & ret_arg : matched_schema->return_types) { + outputs.push_back(new_node->addOutput()->setType(ret_arg)); } return std::make_shared(packOutputs(*m.graph(), outputs)); } @@ -371,7 +371,14 @@ void initJitScriptBindings(PyObject* module) { // public. py::class_>(m, "ScriptModule") .def(py::init<>()) - .def("save", &Module::save) + .def("save", [](std::shared_ptr m, const std::string& filename) { + m->save(filename); + }) + .def("save_to_buffer", [](std::shared_ptr m) { + std::ostringstream buf; + m->save(buf); + return py::bytes(buf.str()); + }) .def("_set_optimized", &Module::set_optimized) .def( "_define", @@ -534,7 +541,13 @@ void initJitScriptBindings(PyObject* module) { }); m.def("merge_type_from_type_comment", &mergeTypesFromTypeComment); - m.def("import_ir_module", import_ir_module); + m.def("import_ir_module", [](ModuleLookup module_lookup, const std::string& filename) { + import_ir_module(module_lookup, filename); + }); + m.def("import_ir_module_from_buffer", [](ModuleLookup module_lookup, const std::string& buffer) { + std::istringstream in(buffer); + import_ir_module(module_lookup, in); + }); } } // namespace script diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index b1f6a6e220bbc9..61261a352d456e 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -37,8 +37,15 @@ const FunctionSchema& Method::getSchema() const { return *schema; } -std::vector Method::emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs) { - JIT_ASSERT(!executor); +at::optional> try_emit_call_to( + Graph& graph, + SourceRange loc, + Method& callee, + ArrayRef args, + ArrayRef kwargs, + std::stringstream& failure_messages, + Method* caller, + bool conv_tensors_to_nums) { try { callee.ensure_defined(); } catch (RecursiveMethodCallError&) { @@ -47,19 +54,38 @@ std::vector Method::emit_call_to(SourceRange loc, Method & callee, Array } auto fn = callee.graph(); - std::stringstream failure_messages; - auto all_inputs = tryMatchSchema( + auto 
matched_schema = tryMatchSchema( callee.getSchema(), - loc, *graph(), args, kwargs, failure_messages, /*conv_tensors_to_nums*/true); - if(!all_inputs) - throw ErrorReport(loc) << failure_messages.str(); + loc, graph, args, kwargs, failure_messages, conv_tensors_to_nums); + if(!matched_schema) + return at::nullopt; // parameters to callee method (which become parameters to _this_ method // if they were not already) - for(at::Tensor* member : callee.member_inputs) { - all_inputs->push_back(get_or_add_parameter(member)); + for(at::Tensor* member : callee.params()) { + if(!caller) { + throw ErrorReport(loc) << " attempting to call a method with parameters from a raw graph. File a bug report"; + } + matched_schema->inputs.push_back(caller->get_or_add_parameter(member)); } - return inlineCallTo(*graph(), *callee.graph(), *all_inputs); + return inlineCallTo(graph, *callee.graph(), matched_schema->inputs); +} + +std::vector Method::emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs) { + JIT_ASSERT(!executor); + std::stringstream failure_messages; + if (auto result = try_emit_call_to( + *graph(), + loc, + callee, + args, + kwargs, + failure_messages, + this, + /*conv_tensors_to_nums=*/true)) { + return *result; + } + throw ErrorReport(loc) << failure_messages.str(); } void Method::ensure_defined() { @@ -71,6 +97,10 @@ void Method::ensure_defined() { } } +void Module::save(std::ostream& out) { + ExportModule(*this, out); +} + void Module::save(const std::string& filename) { ExportModule(*this, filename); } diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 50ae9f48fb3c93..cafd084ce2cca3 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -20,6 +20,7 @@ #include #include #include +#include // This file contains classes which assist in desugaring Python style // modules and their methods into flattened graphs which don't have any @@ -84,6 +85,7 @@ struct Method { // defined here to keep details of member_input handling confined to this class std::vector emit_call_to(SourceRange loc, Method & callee, ArrayRef args, ArrayRef kwargs); + // if this isn't yet defined, run its method_creator function void ensure_defined(); @@ -376,6 +378,8 @@ struct Module { return get_method(method_name)({IValue(std::forward(args))...}); } + void save(std::ostream& out); + void save(const std::string& filename); private: @@ -390,4 +394,18 @@ struct Module { bool optimize; }; +// returns at::nullopt and fills in failure_messages if the callee does not +// match the functions schema +at::optional> try_emit_call_to( + Graph& graph, + SourceRange loc, + Method& callee, + ArrayRef args, + ArrayRef kwargs, + std::stringstream& failure_messages, + // when callee uses no parameters (e.g. it is a function in a compilation unit, + // and not a method), then nullptr can be passed as caller. 
+ Method* caller, + bool conv_tensors_to_nums); + }}} diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 14e5e4f5ae1354..64f7f9c8db935a 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -64,6 +64,8 @@ struct Parser { std::vector exprs = { prefix }; while(L.cur().kind != end) { L.expect(','); + if (L.cur().kind == end) + break; exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); diff --git a/torch/csrc/jit/serialization.h b/torch/csrc/jit/serialization.h index a4ebd864ac6cc3..9fc8d4a1b688dd 100644 --- a/torch/csrc/jit/serialization.h +++ b/torch/csrc/jit/serialization.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include namespace torch { namespace jit { @@ -75,25 +78,16 @@ namespace { static constexpr uint64_t kFileFormatVersion = 0x1L; static constexpr uint8_t kPadValue = 0xEF; - void wrapPErrorAndThrow(const std::string& msg) { - std::ostringstream oss; - oss << msg << " : " << strerror(errno); - throw std::runtime_error(oss.str()); - } } // namespace -class PyTorchFileReader { +class PyTorchStreamReader { public: - PyTorchFileReader(std::string filename) { - fp = std::fopen(filename.c_str(), "rb"); - if (!fp) { - wrapPErrorAndThrow("Couldn't open file for reading!"); - } + PyTorchStreamReader(std::istream& in_) : in(in_) { // Store file size so we know when we're done reading because the f* APIs // don't do a good job of that - std::fseek(fp, 0L, SEEK_END); - file_size = std::ftell(fp); - std::fseek(fp, 0L, SEEK_SET); + in.seekg(0L, in.end); + file_size = in.tellg(); + in.seekg(0L); readAndValidateFileHeader(); // Do this now since we're reasonably sure this is actually a PyT file from // the header. @@ -115,7 +109,7 @@ class PyTorchFileReader { } // Seek to the provided offset cursor = key; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); auto tag = read64BitIntegerLittleEndian(); if (tag != RecordTags::STORAGE) { throw std::runtime_error("Attempted to read a record of non-storage type"); @@ -124,18 +118,16 @@ class PyTorchFileReader { seekToNextAlignmentBoundary(); auto ptr = malloc(size); at::DataPtr retval(ptr, ptr, free, at::kCPU); - if (!std::fread(ptr, size, 1, fp)) { - wrapPErrorAndThrow("Failed to read data from record"); - } + + in.read((char*)ptr, size); cursor += size; seekToNextAlignmentBoundary(); return std::tuple(std::move(retval), size); } - ~PyTorchFileReader() { - std::fclose(fp); + ~PyTorchStreamReader() { } private: - FILE *fp; + std::istream& in; size_t cursor = 0; size_t file_size; size_t last_record_offset; @@ -144,8 +136,9 @@ class PyTorchFileReader { uint64_t read64BitIntegerLittleEndian() { uint64_t retval; // TODO endian swap on platforms that need it? - size_t read_bytes = std::fread(&retval, 1u, 8u, fp); - if (read_bytes != 8u) { + in.read(reinterpret_cast(&retval), 8); + std::streamsize read_bytes = in.gcount(); + if (read_bytes != 8) { std::ostringstream errmsg; errmsg << "Expected to read 8 bytes but got " << read_bytes; throw std::runtime_error(errmsg.str()); @@ -158,7 +151,7 @@ class PyTorchFileReader { size_t next_offset = (cursor + kFieldAlignment) - (cursor % kFieldAlignment); size_t pad_amount = next_offset - cursor; cursor += pad_amount; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); } // File format deserialization functions @@ -183,7 +176,7 @@ class PyTorchFileReader { // Seek to location of file footer. 
We've already validated that the file // length is a multiple of the alignment size cursor = file_size - kFieldAlignment; - std::fseek(fp, cursor, SEEK_SET); + in.seekg(cursor); auto tag = read64BitIntegerLittleEndian(); if (tag != RecordTags::FOOTER) { throw std::runtime_error("File footer has wrong record type. Is this" @@ -197,13 +190,9 @@ class PyTorchFileReader { } }; -class PyTorchFileWriter { +class PyTorchStreamWriter { public: - PyTorchFileWriter(const std::string& filename) { - fp = std::fopen(filename.c_str(), "wb"); - if (!fp) { - wrapPErrorAndThrow("Unable to open PyTorch file for writing!"); - } + PyTorchStreamWriter(std::ostream& out_) : out(out_) { writeFileHeader(); // In the case that we do not write any records into this file, the last // record index written into the footer will point to the footer itself. @@ -224,15 +213,14 @@ class PyTorchFileWriter { JIT_ASSERT(!finalized); writeFileFooter(); finalized = true; - std::fclose(fp); } - ~PyTorchFileWriter() { + ~PyTorchStreamWriter() { if (!finalized) { writeEndOfFile(); } } private: - FILE *fp; + std::ostream& out; size_t cursor = 0; bool finalized = false; size_t last_record_idx = 0; @@ -240,17 +228,13 @@ class PyTorchFileWriter { // Utility functions void write64BitIntegerLittleEndian(const uint64_t value) { // TODO endian swap on platforms that need it? - if (!std::fwrite(&value, 8u, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(reinterpret_cast(&value), 8); cursor += 8u; } void writePad(const size_t num_bytes) { static std::vector pad_buffer(kPadValue, kFieldAlignment); - if (!std::fwrite(pad_buffer.data(), num_bytes, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(pad_buffer.data(), num_bytes); cursor += num_bytes; } @@ -261,9 +245,7 @@ class PyTorchFileWriter { } void writeBuffer(const char* data, size_t size) { - if (!std::fwrite(data, size, 1u, fp)) { - wrapPErrorAndThrow("Unable to write to file!"); - } + out.write(data, size); cursor += size; } @@ -281,5 +263,43 @@ class PyTorchFileWriter { } }; +class PyTorchFileReader { + public: + PyTorchFileReader(const std::string& filename) : + in(filename, std::ios_base::binary), + stream_reader(in) {} + + std::tuple getLastRecord() { + return stream_reader.getLastRecord(); + } + + std::tuple getRecordWithKey(uint64_t key) { + return stream_reader.getRecordWithKey(key); + } + + private: + std::ifstream in; + PyTorchStreamReader stream_reader; +}; + +class PyTorchFileWriter { + public: + PyTorchFileWriter(const std::string& filename) : + out(filename, std::ios_base::binary), + stream_writer(out) {} + + uint64_t writeRecord(const char* data, size_t size) { + return stream_writer.writeRecord(data, size); + } + + void writeEndOfFile() { + stream_writer.writeEndOfFile(); + out.close(); + } + + private: + std::ofstream out; + PyTorchStreamWriter stream_writer; +}; }} // namespace torch::jit diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index a4ada2647af7f5..855adad429191f 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -55,6 +55,10 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "string"; } else if(t.kind() == TypeKind::GeneratorType) { out << "Generator"; + } else if(t.kind() == TypeKind::VarType) { + out << t.expect()->name(); + } else if(t.kind() == TypeKind::WorldType) { + out << "World"; } else { AT_ERROR("unknown type kind"); } @@ -89,6 +93,10 @@ GeneratorTypePtr GeneratorType::get() { static auto value = GeneratorType::create(); return value; } 
+WorldTypePtr WorldType::get() { + static auto value = WorldType::create(); + return value; +} StringTypePtr StringType::get() { static auto value = StringType::create(); return value; @@ -170,4 +178,71 @@ at::optional unifyTypes(const TypePtr& t1, const TypePtr& t2) { return at::nullopt; } +TypePtr matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv& type_env) { + if(!formal->hasFreeVariables()) + return formal; + if(auto vt = formal->cast()) { + auto it = type_env.find(vt->name()); + if(it == type_env.end()) { + type_env[vt->name()] = actual; + return actual; + } else if(auto unified = unifyTypes(it->second, actual)) { + type_env[vt->name()] = *unified; + return *unified; + } + std::stringstream ss; + ss << "type variable '" << vt->name() <<"' previously matched to type " << + it->second->str() << " is matched to type " << actual->str(); + throw TypeMatchError(ss.str()); + } else if(auto lt_formal = formal->cast()) { + if(auto lt_actual = actual->cast()) { + return ListType::create(matchTypeVariables(lt_formal->getElementType(), lt_actual->getElementType(), type_env)); + } else { + std::stringstream ss; + ss << "cannot match a list to " << actual->str(); + throw TypeMatchError(ss.str()); + } + } else if(auto tp_formal = formal->cast()) { + if(auto tp_actual = actual->cast()) { + if(tp_formal->elements().size() != tp_actual->elements().size()) { + std::stringstream ss; + throw TypeMatchError("cannot match tuples of mismatched size"); + } + std::vector elements; + for(size_t i = 0; i < tp_formal->elements().size(); ++i) { + TypePtr result = matchTypeVariables( + tp_formal->elements()[i], + tp_actual->elements()[i], + type_env); + elements.push_back(result); + } + return TupleType::create(std::move(elements)); + } else { + std::stringstream ss; + ss << "cannot match a tuple to " << actual->str(); + throw TypeMatchError(ss.str()); + } + } + AT_ERROR("unhandled free variable container: ", formal->str()); +} + +// change return types like List[List[t]] into List[List[int]] +TORCH_API TypePtr evalTypeVariables(TypePtr type, std::unordered_map& type_env) { + if(!type->hasFreeVariables()) + return type; + + if(auto vt = type->cast()) { + auto it = type_env.find(vt->name()); + AT_ASSERTM(it != type_env.end(), "schema has unbound type variable '", vt->name(), "' in its return type"); + return it->second; + } else if(auto lt = type->cast()) { + return ListType::create(evalTypeVariables(lt->getElementType(), type_env)); + } else if(auto tp = type->cast()) { + return TupleType::create(fmap(tp->elements(), [&](const TypePtr& typ) { + return evalTypeVariables(typ, type_env); + })); + } + return type; +} + }} // namespace torch::jit diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 96e9f45496a34b..49748de239e2b2 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -27,6 +27,8 @@ _(IntType) \ _(NoneType) \ _(StringType) \ _(GeneratorType) \ +_(VarType) \ +_(WorldType) \ enum class TypeKind { #define DEFINE_TYPE(T) T, @@ -133,6 +135,9 @@ struct TORCH_API Type : std::enable_shared_from_this { return r; } virtual ~Type() = default; + virtual bool hasFreeVariables() const { + return false; + } }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -366,6 +371,32 @@ struct TORCH_API CompleteTensorType : public TensorType { std::vector strides_; }; +// This type is a token used to represent effectful computation in the IR. +// See the AnnotateEffects pass for how it is used. 
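+// Mutable operators (e.g. aten::append) take a World token as an extra input
+// and return a new one, so the token's def-use chain orders effectful ops and
+// prevents optimization passes from reordering them.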
+struct WorldType; +using WorldTypePtr = std::shared_ptr; +struct TORCH_API WorldType : public Type { + template + static WorldTypePtr create(T&&... all) { + return WorldTypePtr(new WorldType(std::forward(all)...)); + } + bool operator==(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return "world"; + } + bool isSubtypeOf(const TypePtr rhs) const override { + return *this == *rhs; + } + static const TypeKind Kind = TypeKind::WorldType; + // global singleton + static WorldTypePtr get(); + + private: + WorldType() : Type(TypeKind::WorldType) {} +}; + struct ListType; using ListTypePtr = std::shared_ptr; @@ -400,6 +431,9 @@ struct TORCH_API ListType : public Type { TypePtr getElementType() const { return elem; } + bool hasFreeVariables() const override { + return has_free_variables_; + } // common cast List[Tensor] static ListTypePtr ofTensors(); static ListTypePtr ofInts(); @@ -408,8 +442,11 @@ struct TORCH_API ListType : public Type { static const TypeKind Kind = TypeKind::ListType; private: ListType(TypePtr elem) - : Type(TypeKind::ListType), elem(std::move(elem)) {} + : Type(TypeKind::ListType) + , elem(std::move(elem)) + , has_free_variables_(getElementType()->hasFreeVariables()) {} TypePtr elem; + bool has_free_variables_; }; struct TupleType; @@ -461,12 +498,20 @@ struct TORCH_API TupleType : public Type { ss << "]"; return ss.str(); } + bool hasFreeVariables() const override { + return has_free_variables_; + } static const TypeKind Kind = TypeKind::TupleType; private: TupleType(std::vector elements_) : Type(TypeKind::TupleType) - , elements_(std::move(elements_)) {} + , elements_(std::move(elements_)) { + has_free_variables_ = + std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) { + return v->hasFreeVariables(); + }); + } bool compare(const Type& rhs, std::function fn) const { if(rhs.kind() != kind()) @@ -482,6 +527,7 @@ struct TORCH_API TupleType : public Type { return true; } std::vector elements_; + bool has_free_variables_; }; struct NumberType; @@ -631,6 +677,34 @@ struct GeneratorType : public Type { }; +// a type variable, used in FunctionSchema +struct VarType; +using VarTypePtr = std::shared_ptr; +struct VarType : public Type { + static constexpr bool is_singleton = false; + template + static VarTypePtr create(std::string name_) { + return VarTypePtr(new VarType(std::move(name_))); + } + bool operator==(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return name(); + } + static const TypeKind Kind = TypeKind::VarType; + const std::string& name() const { + return name_; + } + bool hasFreeVariables() const override { + return true; + } +private: + VarType(std::string name_) + : Type(TypeKind::VarType), name_(name_) {} + std::string name_; +}; + TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); // what is the type, ignoring extra size/shape information? // e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...) 
@@ -689,4 +763,17 @@ template<> inline TypePtr getTypePtr>() { return ListType:: TORCH_API TypePtr inferTypeFrom(const IValue& value); +struct TORCH_API TypeMatchError : public std::exception { + TypeMatchError(std::string msg_) + : msg_(std::move(msg_)) {} + const char * what() const noexcept override { + return msg_.c_str(); + } +private: + std::string msg_; +}; +using TypeEnv = std::unordered_map; +TORCH_API TypePtr matchTypeVariables(TypePtr formal, TypePtr actual, TypeEnv & type_env); +TORCH_API TypePtr evalTypeVariables(TypePtr type, TypeEnv & type_env); + }} // namespace torch::jit diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp index eaf93b92be14bb..de98d278d11a10 100644 --- a/torch/csrc/serialization.cpp +++ b/torch/csrc/serialization.cpp @@ -4,34 +4,41 @@ #include "THP.h" #include "serialization.h" -static ssize_t doPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes); -static ssize_t doPythonReadInto(PyObject* fildes, void* buf, size_t nbytes); -static ssize_t doPythonWrite(PyObject* fildes, void* buf, size_t nbytes); +template +ssize_t doPartialRead(io fildes, void* buf, size_t nbytes); + +template +ssize_t doPartialWrite(io fildes, void* buf, size_t nbytes); + +static ssize_t doPartialPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes); +static ssize_t doPartialPythonReadInto(PyObject* fildes, void* buf, size_t nbytes); +static ssize_t doPartialPythonWrite(PyObject* fildes, void* buf, size_t nbytes); template <> -ssize_t doRead(int fildes, void* buf, size_t nbytes) { +ssize_t doPartialRead(int fildes, void* buf, size_t nbytes) { return read(fildes, buf, nbytes); } template <> -ssize_t doRead(PyObject* fildes, void* buf, size_t nbytes) { +ssize_t doPartialRead(PyObject* fildes, void* buf, size_t nbytes) { // Try to use fildes.readinto() instead of fildes.read() // because it is more memory efficient. + // TODO: Stop calling PyObject_HasAttrString() in a loop on our read loop auto has_readinto = PyObject_HasAttrString(fildes, "readinto") == 1; if (has_readinto) { - return doPythonReadInto(fildes, buf, nbytes); + return doPartialPythonReadInto(fildes, buf, nbytes); } - return doPythonReadBuffered(fildes, buf, nbytes); + return doPartialPythonReadBuffered(fildes, buf, nbytes); } template <> -ssize_t doWrite(int fildes, void* buf, size_t nbytes) { +ssize_t doPartialWrite(int fildes, void* buf, size_t nbytes) { return write(fildes, buf, nbytes); } template <> -ssize_t doWrite(PyObject* fildes, void* buf, size_t nbytes) { - return doPythonWrite(fildes, buf, nbytes); +ssize_t doPartialWrite(PyObject* fildes, void* buf, size_t nbytes) { + return doPartialPythonWrite(fildes, buf, nbytes); } static inline bool isUnsupportedOperation() { @@ -43,39 +50,39 @@ static inline bool isUnsupportedOperation() { } // Call Python fildes.read(nbytes) and copy it to buf. -static inline ssize_t doPythonReadBuffered(PyObject* fildes, void* buf, size_t nbytes) { - const size_t buffer_size = 262144; // 2^18 - size_t read_bytes = 0; - - while (read_bytes < nbytes) { - auto remaining = nbytes - read_bytes; - auto to_read = remaining > buffer_size ? buffer_size : remaining; - THPObjectPtr r(PyObject_CallMethod(fildes, "read", "i", to_read)); - if (!r) throw python_error(); - - // read output is String (Python 2) / Bytes (Python 3) +static inline ssize_t doPartialPythonReadBuffered(PyObject* fildes, void* buf, size_t raw_nbytes) { + // If we request a large amount of data, f.read() will internally try to + // allocate a buffer of that size. 
This is counterproductive, because + // it's not the buffer we ultimately want to write the data into. Read + // less than that and avoid allocating too much extra memory. + // TODO: Maybe 260 KB is a bit small... + const size_t nbytes = std::min(raw_nbytes, 262144u); // 2^18 (~260 KB) + + THPObjectPtr r(PyObject_CallMethod(fildes, "read", "i", nbytes)); + if (!r) throw python_error(); + + // read output is String (Python 2) / Bytes (Python 3) #if PY_MAJOR_VERSION >= 3 - auto size = PyBytes_GET_SIZE(r.get()); - const void* bytes = PyBytes_AsString(r.get()); + auto size = PyBytes_GET_SIZE(r.get()); + const void* py_buf = PyBytes_AsString(r.get()); #else - auto size = PyString_GET_SIZE(r.get()); - const void* bytes = PyString_AsString(r.get()); + auto size = PyString_GET_SIZE(r.get()); + const void* py_buf = PyString_AsString(r.get()); #endif - // we read EOF - if (size == 0) { - return read_bytes; - } + // we read EOF + if (size == 0) { + return 0; + } - memcpy(reinterpret_cast(buf) + read_bytes, bytes, size); - read_bytes += size; - } // Reading loop + // Slurp it into the buffer we actually want + memcpy(buf, py_buf, size); - return read_bytes; + return size; } // Either does fildes.readinto(buf) or fildes.write(buf) -static inline ssize_t doPythonIO(PyObject* fildes, void* buf, size_t nbytes, bool is_read) { +static inline ssize_t doPartialPythonIO(PyObject* fildes, void* buf, size_t nbytes, bool is_read) { #if PY_MAJOR_VERSION >= 3 auto rw_flag = is_read ? PyBUF_WRITE : PyBUF_READ; THPObjectPtr memview(PyMemoryView_FromMemory( @@ -97,19 +104,77 @@ static inline ssize_t doPythonIO(PyObject* fildes, void* buf, size_t nbytes, boo // fildes.readinto can return UnsupportedOperation so fall back to fildes.read. if (is_read && isUnsupportedOperation()) { PyErr_Clear(); - return doPythonReadBuffered(fildes, buf, nbytes); + return doPartialPythonReadBuffered(fildes, buf, nbytes); } throw python_error(); } // Call Python fildes.readinto(buf) -static ssize_t doPythonReadInto(PyObject* fildes, void* buf, size_t nbytes) { - return doPythonIO(fildes, buf, nbytes, /* is_read */ true); +static ssize_t doPartialPythonReadInto(PyObject* fildes, void* buf, size_t nbytes) { + return doPartialPythonIO(fildes, buf, nbytes, /* is_read */ true); } // Call Python fildes.write(buf) -static ssize_t doPythonWrite(PyObject* fildes, void* buf, size_t nbytes) { - return doPythonIO(fildes, buf, nbytes, /* is_read */ false); +static ssize_t doPartialPythonWrite(PyObject* fildes, void* buf, size_t nbytes) { + return doPartialPythonIO(fildes, buf, nbytes, /* is_read */ false); +} + +// Requires that we read EXACTLY nbytes; fails if we don't. +template +void doRead(io fildes, void* raw_buf, size_t nbytes) { + char* buf = static_cast(raw_buf); + while (nbytes > 0) { + errno = 0; // doPartialRead may not set errno + // we read in 1GB blocks to avoid bugs on Mac OS X Lion + // see https://github.com/pytorch/pytorch/issues/1031 for more details + ssize_t r = doPartialRead(fildes, buf, std::min(nbytes, 1073741824)); + if (r < 0) { + int err = errno; + AT_ASSERTM(err != 0, "read(): impossible! r < 0, but no errno was set"); + AT_ASSERTM(err != EAGAIN, "read(): non-blocking fd ", fildes, + " read EAGAIN; cowardly refusing to spin-wait"); + if (err == EINTR) { + continue; + } else { + AT_ERROR("read(): fd ", fildes, " failed with ", strerror(err)); + } + } else if (r == 0) { + break; + } + buf += r; + // This is guaranteed by POSIX, but I just want to be double-sure + // to not underflow a signed integer. 
+ AT_ASSERT(static_cast(r) <= nbytes); + nbytes -= r; + } + if (nbytes != 0) { + AT_ERROR("unexpected EOF, expected ", nbytes, " more bytes. The file might be corrupted."); + } +} + +template +void doWrite(io fildes, void* raw_buf, size_t nbytes) { + char* buf = static_cast(raw_buf); + while (nbytes > 0) { + errno = 0; // doPartialWrite may not set errno + // we write in 1GB blocks to avoid bugs on Mac OS X Lion + // see https://github.com/pytorch/pytorch/issues/1031 for more details + ssize_t r = doPartialWrite(fildes, buf, std::min(nbytes, 1073741824)); + if (r < 0) { + int err = errno; + AT_ASSERTM(err != 0, "write(): impossible! r < 0, but no errno was set"); + AT_ASSERTM(err != EAGAIN, "write(): non-blocking fd ", fildes, + " read EAGAIN; cowardly refusing to spin-wait"); + if (err == EINTR) { + continue; + } else { + AT_ERROR("write(): fd ", fildes, " failed with ", strerror(err)); + } + } + buf += r; + AT_ASSERT(static_cast(r) <= nbytes); + nbytes -= r; + } } #include "generic/serialization.cpp" diff --git a/torch/csrc/serialization.h b/torch/csrc/serialization.h index 410619a68422c5..df811052fe7cda 100644 --- a/torch/csrc/serialization.h +++ b/torch/csrc/serialization.h @@ -8,9 +8,9 @@ #include template -ssize_t doRead(io fildes, void* buf, size_t nbytes); +void doRead(io fildes, void* buf, size_t nbytes); template -ssize_t doWrite(io fildes, void* buf, size_t nbytes); +void doWrite(io fildes, void* buf, size_t nbytes); #endif diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py index 014a07e53c9532..345fe35ceee614 100644 --- a/torch/distributions/multivariate_normal.py +++ b/torch/distributions/multivariate_normal.py @@ -125,27 +125,29 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, scale_tri if scale_tril.dim() < 2: raise ValueError("scale_tril matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self._unbroadcasted_scale_tril = scale_tril self.scale_tril, loc_ = torch.broadcast_tensors(scale_tril, loc_) elif covariance_matrix is not None: if covariance_matrix.dim() < 2: raise ValueError("covariance_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - self._unbroadcasted_scale_tril = _batch_potrf_lower(covariance_matrix) self.covariance_matrix, loc_ = torch.broadcast_tensors(covariance_matrix, loc_) else: if precision_matrix.dim() < 2: raise ValueError("precision_matrix must be at least two-dimensional, " "with optional leading batch dimensions") - covariance_matrix = _batch_inverse(precision_matrix) - self._unbroadcasted_scale_tril = _batch_potrf_lower(covariance_matrix) - self.covariance_matrix, self.precision_matrix, loc_ = torch.broadcast_tensors( - covariance_matrix, precision_matrix, loc_) + self.precision_matrix, loc_ = torch.broadcast_tensors(precision_matrix, loc_) self.loc = loc_[..., 0] # drop rightmost dim batch_shape, event_shape = self.loc.shape[:-1], self.loc.shape[-1:] super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args) + if scale_tril is not None: + self._unbroadcasted_scale_tril = scale_tril + else: + if precision_matrix is not None: + self.covariance_matrix = _batch_inverse(precision_matrix).expand_as(loc_) + self._unbroadcasted_scale_tril = _batch_potrf_lower(self.covariance_matrix) + def expand(self, batch_shape, _instance=None): new = self._get_checked_instance(MultivariateNormal, _instance) batch_shape = torch.Size(batch_shape) diff --git a/torch/distributions/studentT.py 
b/torch/distributions/studentT.py index 6530940b328e7f..3e995d1477faed 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -41,8 +41,8 @@ def variance(self): def __init__(self, df, loc=0., scale=1., validate_args=None): self.df, self.loc, self.scale = broadcast_all(df, loc, scale) - self._chi2 = Chi2(df) - batch_shape = torch.Size() if isinstance(df, Number) else self.df.size() + self._chi2 = Chi2(self.df) + batch_shape = self.df.size() super(StudentT, self).__init__(batch_shape, validate_args=validate_args) def expand(self, batch_shape, _instance=None): diff --git a/torch/functional.py b/torch/functional.py index 0eac8f16741766..4290f78585a965 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -439,6 +439,8 @@ def unique(input, sorted=False, return_inverse=False, dim=None): before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. + dim (int): the dimension to apply unique. If ``None``, the unique of the + flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional)): A tensor or a tuple of tensors containing @@ -646,8 +648,9 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): Args: input (Tensor): the input tensor - p ({int, float, inf, -inf, 'fro', 'nuc'}): the order of norm + p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. Default: ``'fro'`` The following norms can be calculated: + ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== @@ -656,20 +659,22 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None): 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== - dim ({int, 2-tuple of ints, 2-list of ints}, optional): If it is an int, - vector norm will be calculated, if it is 2-tuple of ints, matrix norm - will be calculated. If the value is None, matrix norm will be calculated - when the input tensor only has two dimensions, vector norm will be - calculated when the input tensor only has one dimension. If the input - tensor has more than two dimensions, the vector norm will be applied to - last dimension. - keepdim (bool): whether the output tensors have :attr:`dim` - retained or not. Ignored if attr:`dim`=``None`` and - :attr:`out`=``None``. + + dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, + vector norm will be calculated, if it is 2-tuple of ints, matrix norm + will be calculated. If the value is None, matrix norm will be calculated + when the input tensor only has two dimensions, vector norm will be + calculated when the input tensor only has one dimension. If the input + tensor has more than two dimensions, the vector norm will be applied to + last dimension. + keepdim (bool, optional): whether the output tensors have :attr:`dim` + retained or not. Ignored if :attr:`dim` = ``None`` and + :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if - attr:`dim`=``None`` and :attr:`out`=``None``. + :attr:`dim` = ``None`` and :attr:`out` = ``None``. 
Example:: + >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f7cea00e6292fd..5fd90b5fd95382 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -20,6 +20,8 @@ import numbers import collections import re +if sys.version_info[0] > 2: + import pathlib def _parse_env(name, default, true_message, false_message): @@ -58,19 +60,27 @@ def scope(scope_name): tracing_state.pop_scope() -def load(filename): +def load(f): r""" - Load a ``ScriptModule`` previously saved with :func:`save ` + Load a ``ScriptModule`` previously saved with :func:`save ` .. DANGER:: All previously saved modules, no matter their device, are always loaded onto the CPU. This is different from :func:`torch.load`'s semantics and may change in the future. Arguments: - filename (string): the file to load + f: a file-like object (has to implement read, readline, tell, and seek), + or a string containing a file name Returns: A ``ScriptModule`` object. + + Example: + >>> torch.jit.load('scriptmodule.pt') + # Load ScriptModule from io.BytesIO object + >>> with open('scriptmodule.pt', 'rb') as f: + buffer = io.BytesIO(f.read()) + >>> torch.jit.load(buffer) """ m = ScriptModule() @@ -82,10 +92,48 @@ def module_lookup(names): curr = getattr(curr, name) return curr - torch._C.import_ir_module(module_lookup, filename) + if isinstance(f, str) or \ + (sys.version_info[0] == 2 and isinstance(f, unicode)) or \ + (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): + torch._C.import_ir_module(module_lookup, f) + else: + torch._C.import_ir_module_from_buffer(module_lookup, f.read()) return m +def save(m, f): + """ + Saves a ScriptModule to a file. + + Args: + m: a ScriptModule to save + f: a file-like object (has to implement write and flush) or a string + containing a file name + + .. warning:: + If you are using Python 2, torch.save does NOT support StringIO.StringIO + as a valid file-like object. This is because the write method should return + the number of bytes written; StringIO.write() does not do this. + + Please use something like io.BytesIO instead. + + Example: + >>> m = torch.jit.ScriptModule() + >>> # Save to file + >>> torch.jit.save(m, 'scriptmodule.pt') + >>> # Save to io.BytesIO buffer + >>> buffer = io.BytesIO() + >>> torch.jit.save(m, buffer) + """ + if isinstance(f, str) or \ + (sys.version_info[0] == 2 and isinstance(f, unicode)) or \ + (sys.version_info[0] == 3 and isinstance(f, pathlib.Path)): + m.save(f) + else: + ret = m.save_to_buffer() + f.write(ret) + + def get_trace_graph(f, args=(), kwargs=None): """ Trace a function or model, returning a tuple consisting of the both the @@ -317,6 +365,8 @@ def __init__(self, graph_diff_error, tensor_compare_error, extra_msg=None): # Check the traced module against a set of user-provided validation inputs @torch.no_grad() def _check_trace(check_inputs, func, executor_options, module, check_tolerance): + # Note: tracing is independent of optimizations, which consume the trace + executor_options['optimize'] = False for inputs in check_inputs: if isinstance(inputs, torch.Tensor): inputs = (inputs,) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index a1bfcbc08e097e..313bad93fea4bd 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -664,10 +664,10 @@ class ConvTranspose2d(_ConvTransposeMixin, _ConvNd): - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - .. math:: + .. 
math:: H_{out} = (H_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + \text{output\_padding}[0] - + .. math:: W_{out} = (W_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{kernel\_size}[1] + \text{output\_padding}[1] @@ -806,13 +806,13 @@ class ConvTranspose3d(_ConvTransposeMixin, _ConvNd): - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` where - .. math:: + .. math:: D_{out} = (D_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{kernel\_size}[0] + \text{output\_padding}[0] - + .. math:: H_{out} = (H_{in} - 1) \times \text{stride}[1] - 2 \times \text{padding}[1] + \text{kernel\_size}[1] + \text{output\_padding}[1] - + .. math:: W_{out} = (W_{in} - 1) \times \text{stride}[2] - 2 \times \text{padding}[2] + \text{kernel\_size}[2] + \text{output\_padding}[2] diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index e1468637ba4ff2..eae2e7fe2cdab3 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -633,12 +633,12 @@ def lt(g, input, other): def ge(g, input, other): other = _maybe_get_scalar(other) - return g.op("Not", lt(g, _if_scalar_type_as(g, other, input), input)) + return g.op("Not", lt(g, input, _if_scalar_type_as(g, other, input))) def le(g, input, other): other = _maybe_get_scalar(other) - return g.op("Not", gt(g, _if_scalar_type_as(g, other, input), input)) + return g.op("Not", gt(g, input, _if_scalar_type_as(g, other, input))) @parse_args('v', 'i') @@ -975,13 +975,24 @@ def zeros_like(g, input): ] -@parse_args('v', 'i', 'i', 'v') +@parse_args('v', 'i', 'v', 'v') def zeros(g, shape, scalar_type, layout, device): # NOTE: no way to set device in ONNX, so we ignore it return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type], input_as_shape_i=1, value_f=0) +def full(g, shape, value, scalar_type, layout, device): + const_value = _maybe_get_const(value, 't') + if _is_value(const_value): + tmp = zeros(shape, scalar_type, layout, device) + return add(tmp, value, g.op("Constant", value_t=torch.tensor(1))) + else: + scalar_type = _get_const(scalar_type, 'i', 'dtype') + return g.op("ConstantFill", shape, dtype_i=scalar_type_to_onnx[scalar_type], + input_as_shape_i=1, value_f=const_value) + + def full_like(g, input, fill_value): # TODO: a more efficient implementation (ConstantFill?) 
return add(g, zeros_like(g, input), fill_value, g.op("Constant", value_t=torch.tensor(1))) diff --git a/torch/optim/adam.py b/torch/optim/adam.py index 308ec0c8cf9150..a26de99ec02b93 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -87,7 +87,7 @@ def step(self, closure=None): state['step'] += 1 if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) + grad.add_(group['weight_decay'], p.data) # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(1 - beta1, grad) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 4b1c4cbc32bc09..eff79df6f29b84 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -10,8 +10,6 @@ import tempfile import warnings -from future.utils import raise_from - import torch from .file_baton import FileBaton from ._cpp_extension_versioner import ExtensionVersioner @@ -858,7 +856,7 @@ def _build_extension_module(name, build_directory, verbose): message = "Error building extension '{}'".format(name) if hasattr(error, 'output') and error.output: message += ": {}".format(error.output.decode()) - raise_from(RuntimeError(message), None) + raise RuntimeError(message) def _import_module_from_library(module_name, path): diff --git a/torch/utils/ffi/__init__.py b/torch/utils/ffi/__init__.py index 086cd99839eb1f..e47a4f8a341705 100644 --- a/torch/utils/ffi/__init__.py +++ b/torch/utils/ffi/__init__.py @@ -1,213 +1 @@ -import os -import glob -import tempfile -import shutil -from functools import wraps, reduce -from string import Template -import torch -import torch.cuda -from torch._utils import _accumulate - -try: - import cffi -except ImportError: - raise ImportError("torch.utils.ffi requires the cffi package") - - -if cffi.__version_info__ < (1, 4, 0): - raise ImportError("torch.utils.ffi requires cffi version >= 1.4, but " - "got " + '.'.join(map(str, cffi.__version_info__))) - - -def _generate_typedefs(): - typedefs = [] - for t in ['Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte']: - for lib in ['TH', 'THCuda']: - for kind in ['Tensor', 'Storage']: - python_name = t + kind - if t == 'Float' and lib == 'THCuda': - th_name = 'THCuda' + kind - else: - th_name = lib + t + kind - th_struct = 'struct ' + th_name - - typedefs += ['typedef {} {};'.format(th_struct, th_name)] - # We have to assemble a string here, because we're going to - # do this lookup based on tensor.type(), which returns a - # string (not a type object, as this code was before) - python_module = 'torch.cuda' if lib == 'THCuda' else 'torch' - python_class = python_module + '.' 
+ python_name - _cffi_to_torch[th_struct] = python_class - _torch_to_cffi[python_class] = th_struct - return '\n'.join(typedefs) + '\n' -_cffi_to_torch = {} -_torch_to_cffi = {} -_typedefs = _generate_typedefs() - - -PY_MODULE_TEMPLATE = Template(""" -from torch.utils.ffi import _wrap_function -from .$cffi_wrapper_name import lib as _lib, ffi as _ffi - -__all__ = [] -def _import_symbols(locals): - for symbol in dir(_lib): - fn = getattr(_lib, symbol) - if callable(fn): - locals[symbol] = _wrap_function(fn, _ffi) - else: - locals[symbol] = fn - __all__.append(symbol) - -_import_symbols(locals()) -""") - - -def _setup_wrapper(with_cuda): - here = os.path.abspath(os.path.dirname(__file__)) - lib_dir = os.path.join(here, '..', '..', 'lib') - include_dirs = [ - os.path.join(lib_dir, 'include'), - os.path.join(lib_dir, 'include', 'TH'), - ] - - wrapper_source = '#include \n' - if with_cuda: - import torch.cuda - wrapper_source += '#include \n' - if os.sys.platform == 'win32': - cuda_include_dirs = glob.glob(os.getenv('CUDA_PATH', '') + '/include') - cuda_include_dirs += glob.glob(os.getenv('NVTOOLSEXT_PATH', '') + '/include') - else: - cuda_include_dirs = glob.glob('/usr/local/cuda/include') - cuda_include_dirs += glob.glob('/Developer/NVIDIA/CUDA-*/include') - include_dirs.append(os.path.join(lib_dir, 'include', 'THC')) - include_dirs.extend(cuda_include_dirs) - return wrapper_source, include_dirs - - -def _create_module_dir(base_path, fullname): - module, _, name = fullname.rpartition('.') - if not module: - target_dir = name - else: - target_dir = reduce(os.path.join, fullname.split('.')) - target_dir = os.path.join(base_path, target_dir) - try: - os.makedirs(target_dir) - except os.error: - pass - for dirname in _accumulate(fullname.split('.'), os.path.join): - init_file = os.path.join(base_path, dirname, '__init__.py') - open(init_file, 'a').close() # Create file if it doesn't exist yet - return name, target_dir - - -def _build_extension(ffi, cffi_wrapper_name, target_dir, verbose): - try: - tmpdir = tempfile.mkdtemp() - ext_suf = '.pyd' if os.sys.platform == 'win32' else '.so' - libname = cffi_wrapper_name + ext_suf - outfile = ffi.compile(tmpdir=tmpdir, verbose=verbose, target=libname) - shutil.copy(outfile, os.path.join(target_dir, libname)) - finally: - shutil.rmtree(tmpdir) - - -def _make_python_wrapper(name, cffi_wrapper_name, target_dir): - py_source = PY_MODULE_TEMPLATE.substitute(name=name, - cffi_wrapper_name=cffi_wrapper_name) - with open(os.path.join(target_dir, '__init__.py'), 'w') as f: - f.write(py_source) - - -def create_extension(name, headers, sources, verbose=True, with_cuda=False, - package=False, relative_to='.', **kwargs): - """Creates and configures a cffi.FFI object, that builds PyTorch extension. - - Arguments: - name (str): package name. Can be a nested module e.g. ``.ext.my_lib``. - headers (str or List[str]): list of headers, that contain only exported - functions - sources (List[str]): list of sources to compile. - verbose (bool, optional): if set to ``False``, no output will be printed - (default: True). - with_cuda (bool, optional): set to ``True`` to compile with CUDA headers - (default: False) - package (bool, optional): set to ``True`` to build in package mode (for modules - meant to be installed as pip packages) (default: False). - relative_to (str, optional): path of the build file. Required when - ``package is True``. It's best to use ``__file__`` for this argument. - kwargs: additional arguments that are passed to ffi to declare the - extension. 
See `Extension API reference`_ for details. - - .. _`Extension API reference`: https://docs.python.org/3/distutils/apiref.html#distutils.core.Extension - """ - base_path = os.path.abspath(os.path.dirname(relative_to)) - name_suffix, target_dir = _create_module_dir(base_path, name) - if not package: - cffi_wrapper_name = '_' + name_suffix - else: - cffi_wrapper_name = (name.rpartition('.')[0] + - '.{0}._{0}'.format(name_suffix)) - - wrapper_source, include_dirs = _setup_wrapper(with_cuda) - include_dirs.extend(kwargs.pop('include_dirs', [])) - - if os.sys.platform == 'win32': - library_dirs = glob.glob(os.getenv('CUDA_PATH', '') + '/lib/x64') - library_dirs += glob.glob(os.getenv('NVTOOLSEXT_PATH', '') + '/lib/x64') - - here = os.path.abspath(os.path.dirname(__file__)) - lib_dir = os.path.join(here, '..', '..', 'lib') - - library_dirs.append(os.path.join(lib_dir)) - else: - library_dirs = [] - library_dirs.extend(kwargs.pop('library_dirs', [])) - - if isinstance(headers, str): - headers = [headers] - all_headers_source = '' - for header in headers: - with open(os.path.join(base_path, header), 'r') as f: - all_headers_source += f.read() + '\n\n' - - ffi = cffi.FFI() - sources = [os.path.join(base_path, src) for src in sources] - # NB: TH headers are C99 now - kwargs['extra_compile_args'] = ['-std=c99'] + kwargs.get('extra_compile_args', []) - ffi.set_source(cffi_wrapper_name, wrapper_source + all_headers_source, - sources=sources, - include_dirs=include_dirs, - library_dirs=library_dirs, **kwargs) - ffi.cdef(_typedefs + all_headers_source) - - _make_python_wrapper(name_suffix, '_' + name_suffix, target_dir) - - def build(): - _build_extension(ffi, cffi_wrapper_name, target_dir, verbose) - ffi.build = build - return ffi - - -def _wrap_function(function, ffi): - @wraps(function) - def safe_call(*args, **kwargs): - args = tuple(ffi.cast(_torch_to_cffi.get(arg.type(), 'void') + '*', arg._cdata) - if isinstance(arg, torch.Tensor) or torch.is_storage(arg) - else arg - for arg in args) - args = (function,) + args - result = torch._C._safe_call(*args, **kwargs) - if isinstance(result, ffi.CData): - typeof = ffi.typeof(result) - if typeof.kind == 'pointer': - cdata = int(ffi.cast('uintptr_t', result)) - cname = typeof.item.cname - if cname in _cffi_to_torch: - # TODO: Maybe there is a less janky way to eval - # off of this - return eval(_cffi_to_torch[cname])(cdata=cdata) - return result - return safe_call +raise ImportError("torch.utils.ffi is deprecated. Please use cpp extensions instead.")
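
The doRead/doWrite rewrite above loops until the full byte count has been transferred, retries on EINTR, refuses to spin-wait on EAGAIN, and caps each call at 1 GiB to work around the Mac OS X write() issue referenced in pytorch/pytorch#1031. The same pattern, sketched in Python against the plain os file API purely for illustration (write_all and read_exactly are made-up names, not part of the patch):

    import os

    _CHUNK = 1 << 30  # 1 GiB per call, mirroring the C++ helpers above

    def write_all(fd, data):
        # Write all of `data` to `fd`, retrying short writes and EINTR.
        view = memoryview(data)
        while len(view) > 0:
            try:
                written = os.write(fd, view[:_CHUNK])
            except InterruptedError:
                continue  # EINTR: retry (Python 3.5+ already retries internally)
            except BlockingIOError:
                raise RuntimeError("non-blocking fd got EAGAIN; refusing to spin-wait")
            view = view[written:]

    def read_exactly(fd, nbytes):
        # Read exactly `nbytes` from `fd`, or fail on unexpected EOF.
        chunks = []
        while nbytes > 0:
            try:
                chunk = os.read(fd, min(nbytes, _CHUNK))
            except InterruptedError:
                continue
            if not chunk:
                raise RuntimeError("unexpected EOF, the file might be corrupted")
            chunks.append(chunk)
            nbytes -= len(chunk)
        return b"".join(chunks)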
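
The multivariate_normal.py hunk above reworks how the three mutually exclusive parameterizations are broadcast and how _unbroadcasted_scale_tril is derived, while the constructor still takes exactly one of covariance_matrix, precision_matrix or scale_tril. A minimal usage sketch (shapes chosen arbitrarily):

    import torch
    from torch.distributions import MultivariateNormal

    loc = torch.zeros(2)
    cov = torch.eye(2)

    d = MultivariateNormal(loc, covariance_matrix=cov)
    x = d.sample()                    # shape: torch.Size([2])
    lp = d.log_prob(torch.zeros(2))   # 0-dim tensor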
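
The studentT.py change builds the internal Chi2 from the broadcast self.df and takes batch_shape from it, so a Python-number df now broadcasts against a tensor loc/scale instead of yielding an empty batch shape. A small illustration of the resulting shapes (inferred from broadcast_all semantics, not spelled out in the patch):

    import torch
    from torch.distributions import StudentT

    # df is a plain float, loc is a batch of 5 locations; broadcast_all
    # promotes all three parameters to shape (5,).
    d = StudentT(df=3.0, loc=torch.zeros(5), scale=1.0)
    print(d.batch_shape)       # torch.Size([5])
    print(d.sample().shape)    # torch.Size([5])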
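
The updated torch.norm docstring describes p, dim and keepdim as independent, optional knobs; the calls below reuse the a and b tensors from the docstring's own example and exercise the combinations it documents (return values omitted):

    import torch

    a = torch.arange(9, dtype=torch.float) - 4   # 1-D: [-4., ..., 4.]
    b = a.reshape((3, 3))                        # 2-D

    torch.norm(a)                            # vector 2-norm (Frobenius on 1-D input)
    torch.norm(b)                            # Frobenius norm of the whole matrix
    torch.norm(b, p=2, dim=0)                # 2-norm of each column -> shape (3,)
    torch.norm(b, p=1, dim=1, keepdim=True)  # L1 norm per row      -> shape (3, 1)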
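
torch.jit.save and the extended torch.jit.load accept either a file name (str, or pathlib.Path on Python 3) or a file-like object, as their new docstrings describe; a minimal round-trip combining both paths (MyScriptModule is a made-up example module):

    import io
    import torch

    class MyScriptModule(torch.jit.ScriptModule):
        @torch.jit.script_method
        def forward(self, x):
            return x + x

    m = MyScriptModule()

    # Round-trip through a file on disk.
    torch.jit.save(m, 'scriptmodule.pt')
    loaded = torch.jit.load('scriptmodule.pt')

    # Round-trip through an in-memory buffer: save needs write/flush,
    # load needs read/readline/tell/seek, which io.BytesIO provides.
    buffer = io.BytesIO()
    torch.jit.save(m, buffer)
    buffer.seek(0)
    loaded = torch.jit.load(buffer)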
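
Since torch/utils/ffi/__init__.py now only raises the ImportError above, code that used create_extension has to move to torch.utils.cpp_extension. A rough setup.py equivalent under that assumption (my_ext and my_ext.cpp are placeholder names):

    # setup.py: builds a C++ extension via torch.utils.cpp_extension.
    from setuptools import setup
    from torch.utils.cpp_extension import CppExtension, BuildExtension

    setup(
        name='my_ext',
        ext_modules=[CppExtension('my_ext', ['my_ext.cpp'])],
        cmdclass={'build_ext': BuildExtension},
    )

    # Or build and import just-in-time, without a setup.py:
    #   from torch.utils.cpp_extension import load
    #   my_ext = load(name='my_ext', sources=['my_ext.cpp'], verbose=True)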