diff --git a/.gitignore b/.gitignore index 329e999cf4562..4ef31ba826860 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,9 @@ aten/src/ATen/cuda/CUDAConfig.h build/ dist/ docs/src/**/* +docs/cpp/xml/ +docs/cpp/html/ +docs/cpp/api/ test/.coverage test/cpp/api/mnist test/data/gpu_tensors.pt diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index 8493ccd56ba01..fd73f8d6806f9 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -23,7 +23,7 @@ class DataPtr { public: // Choice of CPU here is arbitrary; if there's an "undefined" device // we could use that too - DataPtr() : ptr_(), device_(kCPU) {} + DataPtr() : ptr_(), device_(DeviceType::CPU) {} DataPtr(void* data, Device device) : ptr_(data), device_(device) {} DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) diff --git a/aten/src/ATen/Backend.h b/aten/src/ATen/Backend.h index 40db1ee67f247..622d181c75df6 100644 --- a/aten/src/ATen/Backend.h +++ b/aten/src/ATen/Backend.h @@ -3,18 +3,25 @@ #include #include #include +#include #include namespace at { +/** + * This legacy enum class defines the set of backends supported by + * old school, code generated Type-based ATen. The reason we are + * sunsetting this enum class is because it doesn't allow for + * open registration of backends. TensorTypeId is the replacement + * for Backend which supports open registration. + * + * ARE YOU SURE YOU WANT TO USE THIS TYPE? Think about if SparseCPU/SparseCUDA + * would make sense in your use case. If it doesn't make sense, maybe + * you want DeviceType. + */ enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA, Undefined, NumOptions }; -constexpr Backend kCPU = Backend::CPU; -constexpr Backend kCUDA = Backend::CUDA; -constexpr Backend kSparseCPU = Backend::SparseCPU; -constexpr Backend kSparseCUDA = Backend::SparseCUDA; - static inline Backend toSparse(Backend b) { switch (b) { case Backend::CPU: @@ -78,6 +85,71 @@ static inline TensorTypeId backendToTensorTypeId(Backend b) { } } +static inline DeviceType backendToDeviceType(Backend b) { + switch (b) { + case Backend::CPU: + return DeviceType::CPU; + case Backend::CUDA: + return DeviceType::CUDA; + case Backend::SparseCPU: + return DeviceType::CPU; + case Backend::SparseCUDA: + return DeviceType::CUDA; + case Backend::Undefined: + AT_ERROR("Undefined backend is not a valid device type"); + default: + AT_ERROR("Unknown backend"); + } +} + +static inline Backend deviceTypeToBackend(DeviceType d) { + switch (d) { + case DeviceType::CPU: + return Backend::CPU; + case DeviceType::CUDA: + return Backend::CUDA; + default: + AT_ERROR("Unknown device type ", d); + } +} + +static inline Backend backendToCPU(Backend b) { + switch (b) { + case Backend::CPU: + return Backend::CPU; + case Backend::CUDA: + return Backend::CPU; + case Backend::SparseCPU: + return Backend::SparseCPU; + case Backend::SparseCUDA: + return Backend::SparseCPU; + case Backend::Undefined: + return Backend::Undefined; + default: + AT_ERROR("Unknown backend"); + } +} + +static inline Backend backendToCUDA(Backend b) { + switch (b) { + case Backend::CPU: + return Backend::CUDA; + case Backend::CUDA: + return Backend::CUDA; + case Backend::SparseCPU: + return Backend::SparseCUDA; + case Backend::SparseCUDA: + return Backend::SparseCUDA; + case Backend::Undefined: + return Backend::Undefined; + default: + AT_ERROR("Unknown backend"); + } +} + +constexpr DeviceType kCPU = DeviceType::CPU; +constexpr DeviceType kCUDA = DeviceType::CUDA; + static inline const char* toString(Backend b) { 
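// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a minimal stand-in for the new
// Backend <-> DeviceType helpers introduced above, showing that both the dense
// and the sparse backend of a device collapse onto the same DeviceType, while
// the reverse mapping always picks the dense backend. The enum values are
// simplified stand-ins for the ATen ones.
#include <stdexcept>

namespace sketch {
enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA, Undefined };
enum class DeviceType { CPU, CUDA };

inline DeviceType backendToDeviceType(Backend b) {
  switch (b) {
    case Backend::CPU:
    case Backend::SparseCPU:
      return DeviceType::CPU;
    case Backend::CUDA:
    case Backend::SparseCUDA:
      return DeviceType::CUDA;
    default:
      throw std::runtime_error("Undefined backend has no device type");
  }
}

inline Backend deviceTypeToBackend(DeviceType d) {
  // Lossy by design: sparse-ness is a property of the Backend, not the device.
  return d == DeviceType::CPU ? Backend::CPU : Backend::CUDA;
}
} // namespace sketch
// ---------------------------------------------------------------------------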
switch (b) { case Backend::CPU: diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 0c0e99b90906c..f85996f74c4b7 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -32,7 +32,7 @@ Context::Context() THSetDefaultErrorHandler(errorHandler,nullptr); THSetDefaultArgErrorHandler(argErrorHandler,nullptr); - generator_registry[static_cast(Backend::CPU)] + generator_registry[static_cast(DeviceType::CPU)] .reset(new CPUGenerator(this)); Type::registerCPU(this); } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 3171a11ada8e3..6cbc7d0d7961b 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -25,7 +25,7 @@ class AT_API Context { return type_registry[static_cast(p)][static_cast(s)].get(); } Type * getTypeOpt(Backend p, ScalarType s) { - initCUDAIfNeeded(p); + if (p != Backend::Undefined) initCUDAIfNeeded(backendToDeviceType(p)); auto type = getTypeRaw(p, s); if(!type) { @@ -42,11 +42,11 @@ class AT_API Context { if (!type) AT_ERROR(toString(p), toString(s), "Type is not enabled."); return *type; } - Generator & defaultGenerator(Backend p) { - initCUDAIfNeeded(p); - auto & generator = generator_registry[static_cast(p)]; + Generator & defaultGenerator(DeviceType device_type) { + initCUDAIfNeeded(device_type); + auto & generator = generator_registry[static_cast(device_type)]; if(!generator) - AT_ERROR(toString(p), " backend type not enabled."); + AT_ERROR(DeviceTypeName(device_type), " backend type not enabled."); return *generator; } bool hasMKL() const; @@ -64,7 +64,7 @@ class AT_API Context { THCState* lazyInitCUDA() { std::call_once(thc_init,[&] { thc_state = detail::getCUDAHooks().initCUDA(); - generator_registry[static_cast(Backend::CUDA)] = + generator_registry[static_cast(DeviceType::CUDA)] = detail::getCUDAHooks().initCUDAGenerator(this); detail::getCUDAHooks().registerCUDATypes(this); }); @@ -95,16 +95,17 @@ class AT_API Context { bool deterministicCuDNN() const; void setDeterministicCuDNN(bool); std::unique_ptr - generator_registry[static_cast(Backend::NumOptions)]; + generator_registry[static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)]; private: // NB: type_registry has nullptr for all CUDA backends until // CUDA initialization has occurred std::unique_ptr type_registry [static_cast(Backend::NumOptions)] [static_cast(ScalarType::NumOptions)]; - void initCUDAIfNeeded(Backend p) { - if(p == Backend::CUDA) + void initCUDAIfNeeded(DeviceType p) { + if (p == DeviceType::CUDA) { lazyInitCUDA(); + } } std::once_flag thc_init; bool enabled_cudnn = true; @@ -132,6 +133,10 @@ static inline Type& getType(Backend p, ScalarType s) { return globalContext().getType(p, s); } +static inline Type& getType(DeviceType p, ScalarType s) { + return globalContext().getType(deviceTypeToBackend(p), s); +} + static inline Type& CPU(ScalarType s) { return getType(Backend::CPU, s); } diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index 8dbd9ffab2fd2..d48984a2063dc 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -1,9 +1,10 @@ #pragma once -#include +#include #include #include #include +#include #include #include @@ -24,21 +25,6 @@ namespace at { struct Device { using Type = at::DeviceType; - /// Converts a `Backend` to a `DeviceType` if possible. 
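// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the lazy, one-time CUDA setup
// that Context::lazyInitCUDA() above relies on. std::call_once guarantees the
// initializer runs exactly once even with concurrent callers; acquire_state is
// a stand-in for detail::getCUDAHooks().initCUDA().
#include <mutex>

namespace sketch {
struct Context {
  void* lazyInitBackendState() {
    std::call_once(init_flag_, [&] { state_ = acquire_state(); });
    return state_;
  }

 private:
  static void* acquire_state() { return nullptr; }  // placeholder resource
  std::once_flag init_flag_;
  void* state_ = nullptr;
};
} // namespace sketch
// ---------------------------------------------------------------------------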
- static DeviceType backend_to_type(Backend backend) { - switch (backend) { - case kCPU: - case kSparseCPU: - return DeviceType::CPU; - case kCUDA: - case kSparseCUDA: - return DeviceType::CUDA; - default: - AT_ERROR( - "Invalid backend ", toString(backend), " for Device construction"); - } - } - /// Constructs a new `Device` from a `DeviceType` and an optional device /// index. /* implicit */ Device(DeviceType type, int32_t index = -1) @@ -60,11 +46,6 @@ struct Device { /// `` optionally specifies a device index. /* implicit */ Device(const std::string& device_string); - /// Constructs a new `Device` from a `Backend` (which is converted to a - /// `DeviceType`, if possible) and an optional device index. - /* implicit */ Device(Backend backend, int32_t index = -1) - : Device(backend_to_type(backend), index) {} - /// Returns true if the type and index of this `Device` matches that of /// `other`. bool operator==(const Device& other) const noexcept { diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index 1dd6b71c69386..459e7e58bdb38 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -250,7 +250,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi stream << "size:\n" << tensor_.sizes() << "\n"; stream << "]"; } else { - Type& cpudouble = tensor_.type().toBackend(kCPU).toScalarType(kDouble); + Type& cpudouble = tensor_.type().toBackend(Backend::CPU).toScalarType(kDouble); Tensor tensor = tensor_.toType(cpudouble).contiguous(); if(tensor.ndimension() == 0) { stream << defaultfloat << tensor.data()[0] << std::endl; diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index 20b0d1ed71d78..350bd449a31e9 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -67,7 +68,7 @@ struct AT_API TensorOptions { type_ = &type; } this->dtype(type.scalarType()); - this->device({type.backend(), device_index}); + this->device({backendToDeviceType(type.backend()), device_index}); this->layout(type.layout()); } @@ -84,7 +85,12 @@ struct AT_API TensorOptions { /// Constructs a `TensorOptions` object from a backend, forwarded to the /// `Device` constructor. /* implicit */ TensorOptions(Backend backend) - : TensorOptions(Device(backend)) {} + : TensorOptions(Device(backendToDeviceType(backend))) {} + + /// Constructs a `TensorOptions` object from a device type, forwarded to the + /// `Device` constructor. + /* implicit */ TensorOptions(DeviceType device_type) + : TensorOptions(Device(device_type)) {} /// Constructs a `TensorOptions` object with the given dtype. /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() { @@ -190,9 +196,9 @@ struct AT_API TensorOptions { Backend backend() const noexcept { Backend backend; if (device_.type() == Device::Type::CPU) { - backend = (layout_ == kStrided) ? kCPU : kSparseCPU; + backend = (layout_ == kStrided) ? Backend::CPU : Backend::SparseCPU; } else { - backend = (layout_ == kStrided) ? kCUDA : kSparseCUDA; + backend = (layout_ == kStrided) ? Backend::CUDA : Backend::SparseCUDA; } return backend; } diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 3cc64320815af..87a3e5b78ce28 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -7,30 +7,22 @@ // static library (in which case, saying the symbol is coming // from a DLL would be incorrect). 
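// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how TensorOptions::backend()
// above now derives the legacy Backend from the (DeviceType, Layout) pair that
// TensorOptions actually stores. Layout::Strided stands in for at::kStrided.
namespace sketch {
enum class DeviceType { CPU, CUDA };
enum class Layout { Strided, Sparse };
enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA };

inline Backend toBackend(DeviceType device, Layout layout) {
  if (device == DeviceType::CPU) {
    return layout == Layout::Strided ? Backend::CPU : Backend::SparseCPU;
  }
  return layout == Layout::Strided ? Backend::CUDA : Backend::SparseCUDA;
}
} // namespace sketch
// ---------------------------------------------------------------------------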
-#define AT_CORE_EXPORT -#define AT_CORE_IMPORT - #ifdef _WIN32 - #ifndef AT_CORE_STATIC_WINDOWS - #undef AT_CORE_EXPORT - #undef AT_CORE_IMPORT - #define AT_CORE_EXPORT __declspec(dllexport) - #define AT_CORE_IMPORT __declspec(dllimport) - #endif // !defined(AT_CORE_STATIC_WINDOWS) -#else // _WIN32 - #if defined(__GNUC__) || defined(__llvm__) - #undef AT_CORE_EXPORT - #undef AT_CORE_IMPORT - #define AT_CORE_EXPORT __attribute__((__visibility__("default"))) - #define AT_CORE_IMPORT AT_CORE_EXPORT - #endif // defined(__GNUC__) || defined(__llvm__) -#endif // _WIN32 - +#if !defined(AT_CORE_STATIC_WINDOWS) +// TODO: unfiy the controlling macros. #if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) - #define AT_CORE_API AT_CORE_EXPORT +#define AT_CORE_API __declspec(dllexport) #else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) - #define AT_CORE_API AT_CORE_IMPORT +#define AT_CORE_API __declspec(dllimport) #endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#else // !defined(AT_CORE_STATIC_WINDOWS) +#define AT_CORE_API +#endif // !defined(AT_CORE_STATIC_WINDOWS) +#else // _WIN32 +#if defined(__GNUC__) +#define AT_CORE_API __attribute__((__visibility__("default"))) +#endif // defined(__GNUC__) +#endif // _WIN32 // Disable the copy and assignment operator for a class. Note that this will // disable the usage of the class in std containers. diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index 7b3cdebd0263b..7fbaf04c3e759 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -391,14 +391,33 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * * NOTE: the macro needs to be invoked in ::caffe2 namespace */ - +// Implementation note: in MSVC, we will need to prepend the AT_CORE_API +// keyword in order to get things compiled properly. in Linux, gcc seems to +// create attribute ignored error for explicit template instantiations, see +// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930 +// and as a result, we define these two macros slightly differently. +// TODO(jiayq): AT_CORE_API below is not correct, because we may use the +// definition in third party dependent libraries. The proper way is to use +// CAFFE2_EXPORT (which explicitly requires dllexport). Marking this as a +// todo item when the unified build is finished. +#ifdef _MSC_VER #define CAFFE_KNOWN_TYPE(T) \ template <> \ - AT_CORE_EXPORT TypeIdentifier TypeMeta::Id() { \ + AT_CORE_API TypeIdentifier TypeMeta::Id() { \ static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ static TypeNameRegisterer registerer(type_id, #T); \ return type_id; \ } +#else // _MSC_VER +#define CAFFE_KNOWN_TYPE(T) \ + template <> \ + TypeIdentifier TypeMeta::Id() { \ + static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ + static TypeNameRegisterer registerer(type_id, #T); \ + return type_id; \ + } +#endif /** * CAFFE_DECLARE_KNOWN_TYPE and CAFFE_DEFINE_KNOWN_TYPE are used @@ -406,11 +425,19 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * can be resolved at compile time. Please use CAFFE_KNOWN_TYPE() instead * for your own types to allocate dynamic ids for them. 
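// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the registration idiom behind
// CAFFE_KNOWN_TYPE above, with TypeId and the name registry as simplified
// stand-ins for TypeIdentifier/TypeNameRegisterer. A function-local static
// hands each type a unique id on first use; a second static registers a
// printable name exactly once.
#include <atomic>
#include <map>
#include <string>
#include <typeinfo>

namespace sketch {
using TypeId = int;

inline std::map<TypeId, std::string>& type_name_registry() {
  static std::map<TypeId, std::string> registry;
  return registry;
}

inline TypeId create_type_id() {
  static std::atomic<TypeId> counter{0};
  return counter++;
}

template <typename T>
TypeId Id() {
  static const TypeId type_id = create_type_id();
  static const bool registered =
      (type_name_registry().emplace(type_id, typeid(T).name()), true);
  (void)registered;
  return type_id;
}
} // namespace sketch
// ---------------------------------------------------------------------------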
*/ -#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ - template <> \ - AT_CORE_EXPORT inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ - return TypeIdentifier(PreallocatedId); \ +#ifdef _MSC_VER +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ } +#else // _MSC_VER +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ + } +#endif #define CONCAT_IMPL(x, y) x##y #define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y) diff --git a/aten/src/ATen/cuda/CUDAEvent.cpp b/aten/src/ATen/cuda/CUDAEvent.cpp new file mode 100644 index 0000000000000..ab6c8421816ce --- /dev/null +++ b/aten/src/ATen/cuda/CUDAEvent.cpp @@ -0,0 +1,66 @@ +#include "ATen/cuda/CUDAEvent.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAStream.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/core/Error.h" + +#include +#include + +// Internal implementation is entirely hidden +struct CUDAEventInternals { + std::atomic refcount; + int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t + cudaEvent_t event; +}; + +namespace at { +namespace cuda { + +namespace detail { + +/* +* Pointer-based event API +*/ +CUDAEventInternals* CUDAEvent_create(unsigned int flags) { + std::unique_ptr internals { new CUDAEventInternals() }; + internals->refcount = 1; + internals->device = current_device(); + AT_CUDA_CHECK(cudaEventCreateWithFlags(&internals->event, flags)); + return internals.release(); +} + +void CUDAEvent_retain(CUDAEventInternals* internals) { + internals->refcount++; +} + +void CUDAEvent_uncheckedFree(CUDAEventInternals* internals) { + if (--internals->refcount) { + cudaEventDestroy(internals->event); + } +} +cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals) { + return internals->event; +} + +int64_t CUDAEvent_device(CUDAEventInternals* internals) { + return internals->device; +} + +void CUDAEvent_record(CUDAEventInternals* internals, const CUDAStream& stream) { + AT_CUDA_CHECK(cudaEventRecord(internals->event, stream)); +} + +} // namespace detail + +void CUDAEvent::record() const { + record(getCurrentCUDAStream()); +} + +void CUDAEvent::record(const CUDAStream& stream) const { + detail::CUDAEvent_record(internals_, stream); +} + + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h new file mode 100644 index 0000000000000..79abfd3dcc01a --- /dev/null +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include + +#include "cuda_runtime_api.h" + +#include +#include + +/* +* A CUDA event interface with no CUDA build dependency. +* +* Includes the CUDAEvent RAII class and a pointer-based event API. 
+*/ + +struct CUDAEventInternals; + +namespace at { +namespace cuda { + +struct CUDAStream; + +namespace detail { + +// Pointer-based API (for internal use) +// Note: ATen/Context is preferred to work with streams safely +AT_API CUDAEventInternals* CUDAEvent_create(unsigned int flags); +AT_API void CUDAEvent_retain(CUDAEventInternals* internals); +AT_API void CUDAEvent_uncheckedFree(CUDAEventInternals* internals); +AT_API cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals); +AT_API int64_t CUDAEvent_device(CUDAEventInternals* internals); + +} // namespace detail + +struct CUDAEvent { + // Constants + static constexpr unsigned int DEFAULT_FLAGS = cudaEventDisableTiming; + + // Constructors + CUDAEvent(unsigned int flags = DEFAULT_FLAGS) + : internals_(detail::CUDAEvent_create(flags)) {} + + ~CUDAEvent() { detail::CUDAEvent_uncheckedFree(internals_); } + + CUDAEvent(const CUDAEvent& other) { + detail::CUDAEvent_retain(other.internals_); + internals_ = other.internals_; + } + + CUDAEvent(CUDAEvent&& other) { + std::swap(internals_, other.internals_); + } + + CUDAEvent& operator=(CUDAEvent other) noexcept { + std::swap(internals_, other.internals_); + return *this; + } + + operator cudaEvent_t() const { return detail::CUDAEvent_event(internals_); } + + // Less than operator (to allow use in sets) + friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) { + return left.internals_ < right.internals_; + } + + int64_t device() const { return detail::CUDAEvent_device(internals_); } + cudaEvent_t event() const { return detail::CUDAEvent_event(internals_); } + CUDAEventInternals* internals() const { return internals_; } + + void record() const; // Record on the current stream + void record(const CUDAStream& stream) const; + +private: + CUDAEventInternals* internals_; +}; + +} // namespace cuda +} // namespace at + diff --git a/aten/src/ATen/cuda/CUDAStream.cpp b/aten/src/ATen/cuda/CUDAStream.cpp index 2dab634bc71e9..12d571da7f459 100644 --- a/aten/src/ATen/cuda/CUDAStream.cpp +++ b/aten/src/ATen/cuda/CUDAStream.cpp @@ -1,5 +1,6 @@ #include "ATen/cuda/CUDAStream.h" #include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAEvent.h" #include "ATen/cuda/Exceptions.h" #include "ATen/core/Error.h" @@ -173,6 +174,10 @@ namespace detail { } } + void CUDAStream_synchronize_with(CUDAStreamInternals* ptr, const CUDAEvent& event) { + AT_CUDA_CHECK(cudaStreamWaitEvent(ptr->stream, event, 0)); + } + } // namespace detail /* @@ -194,5 +199,9 @@ namespace detail { std::swap(internals_, other.internals_); } + void CUDAStream::synchronize_with(const CUDAEvent& event) const { + detail::CUDAStream_synchronize_with(internals_, event); + } + } // namespace cuda } // namespace at diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h index 545bccfdfbcb7..7a3e1e0595c12 100644 --- a/aten/src/ATen/cuda/CUDAStream.h +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -15,12 +15,13 @@ * The ATen Context interface should be preferred when working with streams. 
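// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the ownership pattern CUDAEvent
// uses above -- heap-allocated, refcounted internals; copy retains; destruction
// releases; copy/move assignment implemented as swap on a by-value parameter.
// Handle/Internals are stand-in names and no CUDA calls are made here.
#include <atomic>
#include <utility>

namespace sketch {
struct Internals {
  std::atomic<int> refcount{1};
};

struct Handle {
  Handle() : internals_(new Internals()) {}
  ~Handle() {
    if (internals_ && --internals_->refcount == 0) delete internals_;
  }
  Handle(const Handle& other) : internals_(other.internals_) {
    internals_->refcount++;
  }
  Handle(Handle&& other) noexcept : internals_(nullptr) {
    std::swap(internals_, other.internals_);
  }
  Handle& operator=(Handle other) noexcept {  // by-value: handles copy and move
    std::swap(internals_, other.internals_);
    return *this;
  }

 private:
  Internals* internals_;
};
} // namespace sketch
// ---------------------------------------------------------------------------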
*/ -// Forward-declares internals struct CUDAStreamInternals; namespace at { namespace cuda { +struct CUDAEvent; + namespace detail { // Pointer-based API (for internal use) @@ -102,6 +103,8 @@ struct CUDAStream { cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } CUDAStreamInternals* internals() const { return internals_; } + void synchronize_with(const CUDAEvent& event) const; + private: CUDAStreamInternals* internals_ = nullptr; }; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index fc61ccd698e26..9f589017822bc 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -293,7 +293,7 @@ def __init__(self, reason): 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': CodeTemplate( - 'check_generator<${Backend}Generator>(${arg_name}, &globalContext().defaultGenerator(backend()))'), + 'check_generator<${Backend}Generator>(${arg_name}, &globalContext().defaultGenerator(device_type()))'), # This is a cast done via direct-construction 'IntListStride': CodeTemplate('at::IntList ${result_name} = get_intlist_stride_th(${arg_name});'), 'real': CodeTemplate('${arg_name}.to${ScalarName}()'), diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 328cdb88e951c..9292994341130 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -28,7 +28,7 @@ Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar AT_ERROR("add(sparse, dense) is not supported. Use add(dense, sparse) instead."); } auto iter = TensorIterator::binary_op(result, self, other); - add_stub(iter->backend(), *iter, alpha); + add_stub(iter->device_type(), *iter, alpha); return result; } @@ -53,7 +53,7 @@ Tensor& div_out(Tensor& result, const Tensor& self, const Tensor& other) { return at::_sparse_div_out(result, self, Scalar(other)); } auto iter = TensorIterator::binary_op(result, self, other); - div_stub(iter->backend(), *iter); + div_stub(iter->device_type(), *iter); return result; } @@ -74,7 +74,7 @@ Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) { return at::_sparse_mul_out(result, self, other); } auto iter = TensorIterator::binary_op(result, self, other); - mul_stub(iter->backend(), *iter); + mul_stub(iter->device_type(), *iter); return result; } @@ -105,7 +105,7 @@ Tensor& sub_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar AT_ERROR("sub(sparse, dense) is not supported. Use sub(dense, sparse) instead."); } auto iter = TensorIterator::binary_op(result, self, other); - sub_stub(iter->backend(), *iter, alpha); + sub_stub(iter->device_type(), *iter, alpha); return result; } diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 4d4d3df1bd35e..dad05dcf8b47a 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -50,17 +50,17 @@ struct AT_API DispatchStub { static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); template - void operator()(Backend backend, ArgTypes... args) { - if (backend == Backend::CPU) { + void operator()(DeviceType device_type, ArgTypes&&... 
args) { + if (device_type == DeviceType::CPU) { if (!cpu_dispatch_ptr) { cpu_dispatch_ptr = choose_cpu_impl(); } - (*cpu_dispatch_ptr)(args...); - } else if (backend == Backend::CUDA) { + (*cpu_dispatch_ptr)(std::forward(args)...); + } else if (device_type == DeviceType::CUDA) { AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(args...); + (*cuda_dispatch_ptr)(std::forward(args)...); } else { - AT_ERROR("DispatchStub: unsupported backend", backend); + AT_ERROR("DispatchStub: unsupported device type", device_type); } } @@ -109,12 +109,33 @@ struct RegisterDispatch { #define DEFINE_DISPATCH(name) struct name name -#if defined(__CUDACC__) -#define REGISTER_DISPATCH(name, fn) \ +#define REGISTER_ARCH_DISPATCH(name, arch, fn) \ + template <> decltype(fn) DispatchStub::arch = fn; + +#ifdef HAVE_AVX_CPU_DEFINITION +#define REGISTER_AVX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX, fn) +#else +#define REGISTER_AVX_DISPATCH(name, fn) +#endif + +#ifdef HAVE_AVX2_CPU_DEFINITION +#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn) +#else +#define REGISTER_AVX2_DISPATCH(name, fn) +#endif + +#define REGISTER_NO_CPU_DISPATCH(name, fn_type) \ + REGISTER_ARCH_DISPATCH(name, DEFAULT, static_cast(nullptr)) \ + REGISTER_AVX_DISPATCH(name, static_cast(nullptr)) \ + REGISTER_AVX2_DISPATCH(name, static_cast(nullptr)) + +#define REGISTER_CUDA_DISPATCH(name, fn) \ static RegisterDispatch name ## __register(name, fn); + +#if defined(__CUDACC__) +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) #elif defined(CPU_CAPABILITY) -#define REGISTER_DISPATCH(name, fn) \ - template <> decltype(fn) DispatchStub::CPU_CAPABILITY = fn; +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) #endif diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index fb1cd8cf2852f..acb9e220b967b 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -49,7 +49,7 @@ namespace { */ THGenerator* get_generator(at::Generator* gen) { - auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto default_gen = &at::globalContext().defaultGenerator(at::kCPU); auto gen_ = at::check_generator(gen, default_gen); return gen_->generator; } diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 8450cba142d52..efaa4a4b6f507 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -81,7 +81,7 @@ ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) } // anomymous namepsace Tensor dropout(const Tensor& input, double p, bool train) { - if (is_fused_kernel_acceptable(input, p)) { + if (train && is_fused_kernel_acceptable(input, p)) { return std::get<0>(input._fused_dropout(1 - p)); } return _dropout(input, p, train); diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 0026a9907d7ec..f304c6798d11c 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -67,7 +67,7 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; auto& dense_type = grad.type(); - auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? kSparseCUDA : kSparseCPU); + auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? 
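// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the dispatch-on-DeviceType
// pattern DispatchStub implements above, reduced to one function pointer per
// device and perfect forwarding of the call arguments. The per-arch CPU
// selection (DEFAULT/AVX/AVX2) from the patch is collapsed into a single
// pointer here; fill_stub and the kernel names are stand-ins.
#include <stdexcept>
#include <utility>

namespace sketch {
enum class DeviceType { CPU, CUDA };

template <typename FnPtr>
struct DispatchStub {
  template <typename... ArgTypes>
  void operator()(DeviceType device_type, ArgTypes&&... args) {
    if (device_type == DeviceType::CPU) {
      if (!cpu_fn) throw std::runtime_error("DispatchStub: missing CPU kernel");
      (*cpu_fn)(std::forward<ArgTypes>(args)...);
    } else if (device_type == DeviceType::CUDA) {
      if (!cuda_fn) throw std::runtime_error("DispatchStub: missing CUDA kernel");
      (*cuda_fn)(std::forward<ArgTypes>(args)...);
    } else {
      throw std::runtime_error("DispatchStub: unsupported device type");
    }
  }
  FnPtr cpu_fn = nullptr;
  FnPtr cuda_fn = nullptr;
};

// Usage: a kernel file fills in the pointer for its device, call sites dispatch
// on a DeviceType, e.g.
//   DispatchStub<void (*)(float*, int)> fill_stub;
//   fill_stub.cpu_fn = &fill_kernel_cpu;
//   fill_stub(DeviceType::CPU, data, n);
} // namespace sketch
// ---------------------------------------------------------------------------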
Backend::SparseCUDA : Backend::SparseCPU); // check if all our grad come from padding_idx if (grad.numel() == 0) { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 388d704a834d4..d6e5ab586cc49 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -89,7 +89,7 @@ Tensor inverse(const Tensor& self) { } Tensor& inverse_out(Tensor &result, const Tensor &self) { - AT_CHECK(self.type().backend() == kCPU || self.type().backend() == kCUDA, + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "tensor should have CPU or CUDA backend"); AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); AT_CHECK(self.size(0) == self.size(1), "tensor should be square"); diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 7214c2c355699..ccae5fb75f5b0 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -364,8 +364,8 @@ Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_le // Convenience function accepting Tensors Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, const Tensor& input_lengths, const Tensor& target_lengths, int64_t BLANK, int64_t reduction) { - Tensor ilc = input_lengths.toType(kLong).toBackend(kCPU).contiguous(); - Tensor tlc = target_lengths.toType(kLong).toBackend(kCPU).contiguous(); + Tensor ilc = input_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); + Tensor tlc = target_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); IntList il(ilc.data(), ilc.numel()); IntList tl(tlc.data(), tlc.numel()); return at::native::ctc_loss(log_probs, targets, il, tl, BLANK, reduction); diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index dcd04fc8d4a7c..8950cf02d857a 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -7,7 +7,7 @@ namespace at { namespace native { Tensor pin_memory(const Tensor& self) { - if (self.type().backend() != kCPU) { + if (self.type().backend() != Backend::CPU) { AT_ERROR("cannot pin '", self.type().toString(), "' only CPU memory can be pinned"); } auto* allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 4e7a23fd1acfa..b93b7c0d2627f 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -1,3 +1,5 @@ +#include "ATen/native/RNN.h" + #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" @@ -286,7 +288,7 @@ struct FullBidirectionalLayer : Layer, pair_of< std::vector reverse(std::vector&& x) const { std::reverse(x.begin(), x.end()); - return x; + return std::move(x); } FullLayer layer_; @@ -499,100 +501,6 @@ std::tuple _lstm_impl( return std::make_tuple(result.outputs, at::stack(hy, 0), at::stack(cy, 0)); } -//////////////////////////////////////////////////////////////////////////////// -// CUDNN BINDINGS -//////////////////////////////////////////////////////////////////////////////// - -// These must line up with the CUDNN mode codes: -// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t -enum class CuDNNMode { rnn_relu = 0, rnn_tanh = 1, lstm = 2, gru = 3 }; - -std::tuple unpack_hidden(const Tensor& hidden) { - return std::make_tuple(hidden, at::Tensor{}); -} - -std::tuple unpack_hidden(const tpair_of& hidden) { - return hidden; -} - -template -hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { - 
static_assert(std::is_same::value, "pack_hidden not implemented for this type"); - AT_ERROR("NOT IMPLEMENTED"); -} - -template<> -Tensor pack_hidden(const Tensor& hx, const Tensor& cx) { - AT_ASSERT(cx.numel() == 0); - return hx; -} - -template<> -tpair_of pack_hidden>(const Tensor& hx, const Tensor& cx) { - return std::make_tuple(hx, cx); -} - -const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous " - "chunk of memory. This means they need to be compacted " - "at every call, possibly greatly increasing memory usage. " - "To compact weights again call flatten_parameters()."; - -template -LayerOutput _cudnn_impl( - const Tensor& input, const Tensor& _batch_sizes, - const hidden_type& hidden, - TensorList params, bool has_biases, - CuDNNMode cudnn_mode, const Tensor& weight_buf, const Tensor& dropout_state, - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - if (!weight_buf.defined()) { - AT_WARN(WEIGHT_FORMAT_WARN); - } - - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); - - int64_t hidden_size = hx.size(2); - - AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); - IntList batch_sizes { _batch_sizes.data(), static_cast(_batch_sizes.size(0)) }; - // cudnn_output = std::tuple - auto cudnn_output = at::_cudnn_rnn( - input, params, has_biases ? 4 : 2, weight_buf, - hx, cx, static_cast(cudnn_mode), hidden_size, - num_layers, /*batch_first=*/false, dropout_p, train, bidirectional, - batch_sizes, dropout_state); - - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; -} - -template -LayerOutput _cudnn_impl( - const Tensor& input, - const hidden_type& hidden, - TensorList params, bool has_biases, - CuDNNMode cudnn_mode, const Tensor& weight_buf, const Tensor& dropout_state, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - if (!weight_buf.defined()) { - AT_WARN(WEIGHT_FORMAT_WARN); - } - - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); - - int64_t hidden_size = hx.size(2); - - // cudnn_output = std::tuple - auto cudnn_output = at::_cudnn_rnn( - input, params, has_biases ? 
4 : 2, weight_buf, - hx, cx, static_cast(cudnn_mode), hidden_size, - num_layers, batch_first, dropout_p, train, bidirectional, - /*batch_sizes=*/{}, dropout_state); - - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; -} - } // anonymous namespace //////////////////////////////////////////////////////////////////////////////// @@ -600,16 +508,20 @@ LayerOutput _cudnn_impl( //////////////////////////////////////////////////////////////////////////////// #define ONE_HIDDEN_RNN(NAME, CELL) \ +DEFINE_DISPATCH(NAME##_cudnn_stub); \ +DEFINE_DISPATCH(NAME##_packed_cudnn_stub); \ +REGISTER_NO_CPU_DISPATCH(NAME##_cudnn_stub, rnn_fn); \ +REGISTER_NO_CPU_DISPATCH(NAME##_packed_cudnn_stub, rnn_packed_fn); \ + \ std::tuple NAME( \ const Tensor& _input, const Tensor& hx, \ TensorList _params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first, \ - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \ if (at::cudnn_is_acceptable(_input)) { \ - auto result = _cudnn_impl(_input, hx, _params, has_biases, \ - CuDNNMode::NAME, cudnn_weight_buf, cudnn_dropout_state, \ - num_layers, dropout_p, train, bidirectional, batch_first); \ - return std::make_tuple(result.outputs, result.final_hidden); \ + Tensor output, hy; \ + NAME##_cudnn_stub(_input.type().device_type(), output, hy, _input, hx, _params, has_biases, \ + num_layers, dropout_p, train, bidirectional, batch_first); \ + return std::make_tuple(output, hy); \ } \ auto input = batch_first ? _input.transpose(0, 1) : _input; \ auto params = gather_params(_params, has_biases); \ @@ -624,12 +536,12 @@ std::tuple NAME( \ std::tuple NAME( \ const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \ TensorList _params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional, \ - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \ if (at::cudnn_is_acceptable(data)) { \ - auto result = _cudnn_impl(data, batch_sizes, hx, _params, has_biases, \ - CuDNNMode::NAME, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional); \ - return std::make_tuple(result.outputs, result.final_hidden); \ + Tensor output, hy; \ + NAME##_packed_cudnn_stub(data.type().device_type(), output, hy, data, batch_sizes, hx, \ + _params, has_biases, num_layers, dropout_p, train, bidirectional); \ + return std::make_tuple(output, hy); \ } \ PackedSequence input { data, batch_sizes }; \ auto params = gather_params(_params, has_biases); \ @@ -643,16 +555,21 @@ ONE_HIDDEN_RNN(gru, GRUCell) ONE_HIDDEN_RNN(rnn_tanh, SimpleCell) ONE_HIDDEN_RNN(rnn_relu, SimpleCell) +DEFINE_DISPATCH(lstm_cudnn_stub); +DEFINE_DISPATCH(lstm_packed_cudnn_stub); +REGISTER_NO_CPU_DISPATCH(lstm_cudnn_stub, lstm_fn); +REGISTER_NO_CPU_DISPATCH(lstm_packed_cudnn_stub, lstm_packed_fn); + std::tuple lstm( const Tensor& _input, TensorList hx, TensorList _params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first, - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(_input)) { - auto result = 
_cudnn_impl(_input, std::make_tuple(hx[0], hx[1]), _params, has_biases, - CuDNNMode::lstm, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional, batch_first); - return std::make_tuple(result.outputs, std::get<0>(result.final_hidden), std::get<1>(result.final_hidden)); + Tensor output, hy, cy; + lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases, + num_layers, dropout_p, train, bidirectional, batch_first); + return std::make_tuple(output, hy, cy); } auto input = batch_first ? _input.transpose(0, 1) : _input; auto params = gather_params(_params, has_biases); @@ -667,13 +584,13 @@ std::tuple lstm( std::tuple lstm( const Tensor& data, const Tensor& batch_sizes, TensorList hx, TensorList _params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(data)) { - auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), _params, has_biases, - CuDNNMode::lstm, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional); - return std::make_tuple(result.outputs, std::get<0>(result.final_hidden), std::get<1>(result.final_hidden)); + Tensor output, hy, cy; + lstm_packed_cudnn_stub(data.type().device_type(), output, hy, cy, data, batch_sizes, hx, + _params, has_biases, num_layers, dropout_p, train, bidirectional); + return std::make_tuple(output, hy, cy); } PackedSequence input { data, batch_sizes }; auto params = gather_params(_params, has_biases); diff --git a/aten/src/ATen/native/RNN.h b/aten/src/ATen/native/RNN.h new file mode 100644 index 0000000000000..3fc89993404a9 --- /dev/null +++ b/aten/src/ATen/native/RNN.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool); +using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool); +using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool); +using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool); + +DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub); +DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub); + +}} // namespace at::native + diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 67bc71ca1b68e..db3833e9f8f90 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -413,7 +413,7 @@ void randperm_cpu(Tensor& result, int64_t n, THGenerator* generator) { THGenerator* get_generator(at::Generator* gen) { - auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto default_gen = &at::globalContext().defaultGenerator(at::kCPU); auto gen_ = 
at::check_generator(gen, default_gen); return gen_->generator; } @@ -616,7 +616,7 @@ Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { template Tensor tensor_cuda(ArrayRef values, const TensorOptions& options) { - auto cpu_tensor = tensor_cpu(values, TensorOptions(options).device(at::kCPU)); + auto cpu_tensor = tensor_cpu(values, TensorOptions(options).device(DeviceType::CPU)); return cpu_tensor.to(options.device()); } diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index d8f0fbe1825e5..28989fc398b2a 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -97,7 +97,7 @@ void TensorIterator::compute_common_type() { op.type = &type; if (op.tensor->defined() && type != op.tensor->type()) { if (op.tensor->dim() == 0) { - if (type.backend() != at::kCUDA) { + if (type.backend() != at::Backend::CUDA) { *op.tensor = op.tensor->toType(type); } } else { @@ -300,7 +300,7 @@ bool TensorIterator::is_scalar(int arg) const { } bool TensorIterator::is_cpu_scalar(int arg) const { - return is_scalar(arg) && operands_[arg].tensor->type().backend() == at::kCPU; + return is_scalar(arg) && operands_[arg].tensor->type().backend() == at::Backend::CPU; } void* TensorIterator::data_ptr(int arg) const { diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 0cce66cf06e3e..245866373d476 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -120,6 +120,7 @@ struct AT_API TensorIterator { } ScalarType dtype(int arg) const { return type(arg).scalarType(); } Backend backend(int arg=0) const { return type(arg).backend(); } + DeviceType device_type(int arg=0) const { return type(arg).device_type(); } bool is_scalar(int arg) const; bool is_cpu_scalar(int arg) const; diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 7ecedff060bf2..6733a94db3454 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -104,7 +104,7 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) { #define IMPLEMENT_FLOAT_KERNEL(dispatchtypes, op) \ static void op##_kernel(Tensor& result, const Tensor& self) { \ - checkBackend(#op, {result}, kCPU); \ + checkBackend(#op, {result}, Backend::CPU); \ AT_DISPATCH_##dispatchtypes##_TYPES(self.type(), #op, [&] { \ if (self.is_contiguous() && result.is_contiguous()) { \ vml::v##op( \ diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 7c84694056932..4d99f1b5ed22b 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -75,7 +75,7 @@ template static inline std::unique_ptr pin_memory(int64_t size, Tensor dummy) { int64_t adjusted_size = size * sizeof(T); auto* allocator = cuda::getPinnedMemoryAllocator(); - auto& backend = dummy.type().toBackend(kCPU).toScalarType(kByte); + auto& backend = dummy.type().toBackend(Backend::CPU).toScalarType(kByte); return backend.storageWithAllocator(adjusted_size, allocator); } diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 90a13e8255f71..5427e7de0419f 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -249,7 +249,7 @@ Tensor _bincount_cuda_template( } if (self.dim() != 1 || (!std::is_same::value && - *self.min().toBackend(kCPU).data() < 0)) { + *self.min().cpu().data() < 0)) { 
AT_ERROR("bincount only supports 1-d non-negative integral inputs."); } @@ -268,7 +268,7 @@ Tensor _bincount_cuda_template( auto ret = cuda::CUDA_tensor_histogram( output, self, weights, nbins, 1); } else { - output = native::zeros({nbins}, device(kCUDA).dtype(kLong)); + output = native::zeros({nbins}, device(DeviceType::CUDA).dtype(kLong)); auto ret = cuda::CUDA_tensor_histogram( output, self, weights, nbins, 1); } diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index e4866a21d5523..309b54a299caa 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -49,7 +49,7 @@ Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { result.copy_(randperm_out_cuda(result_float, n, generator)); } else { if (n < 30000) { // For small inputs, we offload it to CPU instead. - auto result_cpu = result.type().toBackend(kCPU).tensor({n}); + auto result_cpu = result.type().cpu().tensor({n}); randperm_out(result_cpu, n, generator); result.copy_(result_cpu); } else { diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 994a652dbaa68..6f2d13a1533bd 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #if !AT_CUDNN_ENABLED() @@ -451,7 +453,7 @@ namespace { // (same for the hh weights, and the ih and hh biases). // Since we're storing all the weights in a single tensor anyway, // might as well merge the CUDNN ones into a single tensor as well - int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data(); + int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data(); if (linear_id == 0 || linear_id == num_linear_layers / 2) { std::initializer_list size = { mat_numel * num_linear_layers / 2, 1}; @@ -477,6 +479,46 @@ namespace { return std::make_pair(params, global_layer_params_count); } + // This is a lightweight version of the method above used to quickly get the expected + // parameter offsets. + std::vector get_expected_data_ptrs( + const Tensor& weight_buf, cudnnHandle_t handle, const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, const TensorDescriptor& x_desc, cudnnDataType_t datatype) { + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; + const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + std::vector data_ptrs; + data_ptrs.reserve(num_dir_layers * 2 * 2); + for (int64_t layer = 0; layer < num_dir_layers; layer++) { + for (auto cudnn_method : cudnn_methods) { + // This API returns a separate pointer for weight of every gate, + // but we represent them as a single tensor, so we're only interested + // in a very limited subset of possible values. 
+ const std::array linear_offsets = { 0, num_linear_layers / 2 }; + for (int64_t linear_id : linear_offsets) { + FilterDescriptor lin_layer_mat_desc; + void* matrix_pointer; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer + )); + data_ptrs.push_back(matrix_pointer); + } + } + } + return data_ptrs; + } + void _copyParams(MatrixRef params_from, MatrixRef params_to) { AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch"); for (size_t i = 0; i < params_from.size(0); i++) { @@ -1007,6 +1049,243 @@ Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int return dropout_desc.state; } +//////////////////////////////////////////////////////////////////////////////// +// CUDA dispatch for the generic RNN ops (at::lstm, at::gru, ...) +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +// Helpers for working with different hidden types. +std::tuple unpack_hidden(const Tensor& hidden) { + return std::make_tuple(hidden, at::Tensor{}); +} + +std::tuple unpack_hidden(const std::tuple& hidden) { + return hidden; +} + +template +hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { + static_assert(std::is_same::value, "pack_hidden not implemented for this type"); + AT_ERROR("NOT IMPLEMENTED"); +} + +template<> +Tensor pack_hidden(const Tensor& hx, const Tensor& cx) { + AT_ASSERT(cx.numel() == 0); + return hx; +} + +template<> +std::tuple pack_hidden>(const Tensor& hx, const Tensor& cx) { + return std::make_tuple(hx, cx); +} + +struct DropoutState { + // Both buffer and event are lazily instantiated when a dropout state is needed + // for the first time. Note that in this case needed != used, as we don't need + // a bufer to e.g. run RNNs in test mode. + at::Tensor buffer; + at::optional event; + std::mutex mutex; + + void lock() { + // NB: We can't ignore the lock even when event is undefined, because someone + // could then define it before we get to unlock(). + mutex.lock(); + if (event) { + cuda::getCurrentCUDAStream().synchronize_with(*event); + } + } + + void unlock() { + if (event) { + event->record(); + } + mutex.unlock(); + } +}; + +DropoutState& get_dropout_state(const Type& tp, double dropout_p, bool train) { + // Each state is slightly over 2MB and initialized lazily, so it's fine to cache them. + static std::vector ten_dropout_state_cache { static_cast(cuda::getNumGPUs()) }; + static std::vector var_dropout_state_cache { static_cast(cuda::getNumGPUs()) }; + static std::mutex state_cache_mut; + + int device = cuda::current_device(); + std::unique_lock lock {state_cache_mut}; + auto& state = tp.is_variable() ? var_dropout_state_cache.at(device) + : ten_dropout_state_cache.at(device); + if (train && dropout_p > 0 && !state.buffer.defined()) { + std::unique_lock lock {state.mutex}; + int64_t seed = at::empty({}, at::kLong).random_().toCLong(); + state.buffer = at::_cudnn_init_dropout_state( + tp.toScalarType(at::kByte), dropout_p, train, seed); + // NB: CUDA binds the event to a device at creation time, so we can initialize it + // only now, when we know we're on the correct device. 
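// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the pack_hidden/unpack_hidden
// idiom used above so one _cudnn_impl template covers both RNN hidden-state
// shapes. The primary template is a compile-time trap; specializations handle
// the single-state case and the LSTM (h, c) pair. State stands in for Tensor.
#include <stdexcept>
#include <tuple>
#include <type_traits>

namespace sketch {
struct State {};

template <typename hidden_type>
hidden_type pack_hidden(const State& hx, const State& cx) {
  static_assert(std::is_same<hidden_type, void>::value,
                "pack_hidden not implemented for this type");
  throw std::logic_error("not implemented");
}

template <>
State pack_hidden<State>(const State& hx, const State& /*cx*/) {
  return hx;  // GRU / tanh / ReLU cells carry only the hidden state
}

template <>
std::tuple<State, State> pack_hidden<std::tuple<State, State>>(
    const State& hx, const State& cx) {
  return std::make_tuple(hx, cx);  // LSTM carries hidden and cell state
}
} // namespace sketch
// ---------------------------------------------------------------------------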
+ state.event.emplace(); + } + return state; +} + +Tensor try_get_weight_buf( + const Tensor& input, TensorList parameters, bool has_biases, + cudnnRNNMode_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional) { + // Prepare all relevant descriptors + auto handle = getCudnnHandle(); + auto datatype = getCudnnDataType(input); + + RNNDescriptorParams rnn; + rnn.set(mode, hidden_size, num_layers, bidirectional, datatype); + RNNDescriptor rnn_desc = rnn.descriptor(handle); + + TensorGeometry x_geom ({1, input.size(-1)}); + TensorDescriptor x_desc; + x_desc.set(datatype, x_geom.sizes(), x_geom.strides(), 5); + + auto num_params = get_num_weights(handle, rnn_desc, x_desc, datatype); + + // Try to get parameter storage + auto & any_param = parameters.at(0); + auto param_storage = any_param.storage(); + auto weight_buf = any_param.type().tensor().set_(*param_storage); + if (weight_buf.size(0) < num_params) { + return {}; + } else if (weight_buf.size(0) > num_params) { + weight_buf = weight_buf.narrow(0, 0, num_params); + } + + // Get and check data pointers + auto expected_data_ptrs = get_expected_data_ptrs( + weight_buf, handle, rnn, rnn_desc, x_desc, datatype); + + int64_t num_parameters = parameters.size(); + int64_t num_ptrs = expected_data_ptrs.size(); + AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2))); + AT_ASSERT(num_ptrs % (has_biases ? 4 : 2) == 0); + for (int64_t param_i = 0, ptr_i = 0; + ptr_i < num_ptrs; + ptr_i += (has_biases ? 2 : 4), param_i += 2) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + } + if (!parameters[num_parameters - 1].is_contiguous()) return {}; + return weight_buf; +} + +const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous " + "chunk of memory. This means they need to be compacted " + "at every call, possibly greatly increasing memory usage. " + "To compact weights again call flatten_parameters()."; + +template +std::pair _cudnn_impl( + const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden, + TensorList params, bool has_biases, cudnnRNNMode_t mode, + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { + Tensor hx, cx; + std::tie(hx, cx) = unpack_hidden(hidden); + int64_t hidden_size = hx.size(2); + + auto weight_buf = try_get_weight_buf( + input, params, has_biases, mode, hidden_size, num_layers, bidirectional); + if (!weight_buf.defined()) { + AT_WARN(WEIGHT_FORMAT_WARN); + } + + AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); + IntList batch_sizes { _batch_sizes.data(), static_cast(_batch_sizes.size(0)) }; + + auto & dropout_state = get_dropout_state(input.type(), dropout_p, train); + std::unique_lock lock { dropout_state }; + // cudnn_output = std::tuple + auto cudnn_output = at::_cudnn_rnn( + input, params, has_biases ? 
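// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): DropoutState above can be used
// with std::unique_lock because any type exposing lock()/unlock() satisfies the
// BasicLockable requirements. This stand-in keeps the mutex half and replaces
// the CUDA event synchronization with comments.
#include <mutex>

namespace sketch {
struct GuardedState {
  std::mutex mutex;

  void lock() {
    // NB: the mutex is taken even if no event exists yet, because another
    // thread could create one before our unlock().
    mutex.lock();
    // In the patch: make the current CUDA stream wait on the last recorded
    // event before touching the shared dropout buffer.
  }
  void unlock() {
    // In the patch: record an event on the current stream first.
    mutex.unlock();
  }
};

inline void use(GuardedState& state) {
  std::unique_lock<GuardedState> guard{state};  // calls state.lock()/unlock()
  // ... launch work that reads/writes the shared state ...
}
} // namespace sketch
// ---------------------------------------------------------------------------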
4 : 2, weight_buf, + hx, cx, static_cast(mode), hidden_size, num_layers, /*batch_first=*/false, + dropout_p, train, bidirectional, batch_sizes, dropout_state.buffer); + + return {std::get<0>(cudnn_output), + pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; +} + +template +std::pair _cudnn_impl( + const Tensor& input, const hidden_type& hidden, + TensorList params, bool has_biases, cudnnRNNMode_t mode, + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + Tensor hx, cx; + std::tie(hx, cx) = unpack_hidden(hidden); + int64_t hidden_size = hx.size(2); + + auto weight_buf = try_get_weight_buf( + input, params, has_biases, mode, hidden_size, num_layers, bidirectional); + if (!weight_buf.defined()) { + AT_WARN(WEIGHT_FORMAT_WARN); + } + + auto & dropout_state = get_dropout_state(input.type(), dropout_p, train); + std::unique_lock lock { dropout_state }; + // cudnn_output = std::tuple + auto cudnn_output = at::_cudnn_rnn( + input, params, has_biases ? 4 : 2, weight_buf, + hx, cx, static_cast(mode), hidden_size, num_layers, batch_first, dropout_p, + train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer); + + return {std::get<0>(cudnn_output), + pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; +} + +#define ONE_HIDDEN_RNN(NAME, MODE) \ +void NAME##_cudnn(Tensor& output, Tensor& hy, \ + const Tensor& input, const Tensor& hx, \ + TensorList params, bool has_biases, \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \ + std::tie(output, hy) = _cudnn_impl(input, hx, params, has_biases, \ + MODE, num_layers, dropout_p, train, bidirectional, batch_first); \ +} \ + \ +void NAME##_packed_cudnn(Tensor& output, Tensor& hy, \ + const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \ + TensorList params, bool has_biases, \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \ + std::tie(output, hy) = _cudnn_impl(data, batch_sizes, hx, params, \ + has_biases, MODE, num_layers, dropout_p, train, bidirectional); \ +} \ + \ +REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \ +REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn); + +ONE_HIDDEN_RNN(gru, CUDNN_GRU) +ONE_HIDDEN_RNN(rnn_tanh, CUDNN_RNN_TANH) +ONE_HIDDEN_RNN(rnn_relu, CUDNN_RNN_RELU) + +void lstm_cudnn(Tensor& output, Tensor& hy, Tensor& cy, + const Tensor& input, TensorList hx, + TensorList params, bool has_biases, + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + auto result = _cudnn_impl(input, std::make_tuple(hx[0], hx[1]), params, has_biases, + CUDNN_LSTM, num_layers, dropout_p, train, bidirectional, batch_first); + output = result.first; + hy = std::get<0>(result.second); + cy = std::get<1>(result.second); +} + +void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy, + const Tensor& data, const Tensor& batch_sizes, TensorList hx, + TensorList params, bool has_biases, + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { + auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), + params, has_biases, CUDNN_LSTM, num_layers, dropout_p, train, bidirectional); + output = result.first; + hy = std::get<0>(result.second); + cy = std::get<1>(result.second); +} + +REGISTER_CUDA_DISPATCH(lstm_cudnn_stub, &lstm_cudnn); +REGISTER_CUDA_DISPATCH(lstm_packed_cudnn_stub, &lstm_packed_cudnn); + +} // anonymous namepsace + }} // namespace at::native #endif // AT_CUDNN_ENABLED() diff --git 
a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index bcac90d5d7de2..f23ee7a322d01 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2110,28 +2110,28 @@ variants: function # RNN cells and layers -- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor, Tensor) +- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) variants: function -- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor, Tensor) +- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) variants: function -- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) variants: function -- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) variants: function -- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) variants: function -- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) variants: function -- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? 
cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) variants: function -- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) variants: function - func: lstm_cell(Tensor input, TensorList hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> (Tensor, Tensor) diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 6c3094e71aa0d..8508cf4c5463a 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -35,8 +35,8 @@ ScalarType ${Type}::scalarType() const { Backend ${Type}::backend() const { return Backend::${Backend}; } -bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } -bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_cuda() const { return backend() == Backend::CUDA || backend() == Backend::SparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == Backend::SparseCPU || backend() == Backend::SparseCUDA; } bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage(bool resizable) const { diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 5a7fd278fb1f0..0d27a88b773fb 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -154,6 +154,9 @@ struct Tensor : public detail::TensorBase { Tensor operator[](Tensor index) const; Tensor operator[](int64_t index) const; + Tensor cpu() const; + Tensor cuda() const; + // ~~~~~ Autograd API ~~~~~ Tensor& set_requires_grad(bool requires_grad) { diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 58c0198375985..1ab26d97a121d 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -19,6 +19,14 @@ inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { return t.copy(*this, non_blocking); } +inline Tensor Tensor::cpu() const { + return toType(type().cpu()); +} + +inline Tensor Tensor::cuda() const { + return toType(type().cuda()); +} + inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { return type().copy_(*this, src, non_blocking); } diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 10036a5286b5b..4523b7b3efbc2 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -74,15 +74,26 @@ struct AT_API Type { Type & toDense() const { return this->toBackend(at::toDense(this->backend())); } + Type & cpu() const { + return this->toBackend(at::backendToCPU(this->backend())); + } + Type & cuda() const { + return this->toBackend(at::backendToCUDA(this->backend())); + } Context& get_context() const { return *context; } - // contingious IDs for all types in the system + // contiguous IDs for all types in the system // for external dispatch virtual TypeID ID() const = 0; // New-style TensorTypeId that supports 
open registration. TensorTypeId type_id() const { return type_id_; } + // NB: This will return DeviceType::CPU for Backend::SparseCPU + DeviceType device_type() const { + return backendToDeviceType(backend()); + } + Tensor copy(const Tensor & src, bool non_blocking=false) const; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; @@ -121,7 +132,7 @@ inline Layout Tensor::layout() const noexcept { } inline Device Tensor::device() const { - return Device(type().backend(), type().is_cuda() ? get_device() : -1); + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); } } // namespace at diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 5f2e354d8869e..82942655a8514 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -46,8 +46,8 @@ ScalarType ${Type}::scalarType() const { Backend ${Type}::backend() const { return Backend::${Backend}; } -bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } -bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_cuda() const { return backend() == Backend::CUDA || backend() == Backend::SparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == Backend::SparseCPU || backend() == Backend::SparseCUDA; } bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage(bool resizable) const { @@ -80,9 +80,9 @@ std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, con ScalarType::${ScalarName}, InefficientStdFunctionContext::makeDataPtr(data, deleter, #if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) + Device(DeviceType::CUDA, getPointerDevice(data)) #else - kCPU + DeviceType::CPU #endif ), size, diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 24359a05e4bf6..38027baae97b7 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -109,31 +109,31 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { } TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } TEST_CASE("apply utils test 2-dim small", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } TEST_CASE("apply utils test 2-dim", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } TEST_CASE("apply utils test 3-dim", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } TEST_CASE("apply utils test 3-dim medium", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } TEST_CASE("apply utils test 10-dim", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index 0271de625fd13..cc831fbd42832 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -24,8 +24,8 @@ void trace() { TEST_CASE( "atest", "[]" ) { - manual_seed(123, at::Backend::CPU); - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCPU); + 
manual_seed(123, at::kCUDA); auto foo = rand({12,6}); REQUIRE(foo.data() == foo.toFloatData()); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 0e668e27919b1..cfd77986d626c 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -57,7 +57,7 @@ static void test(Type & type) { REQUIRE(Scalar(z_sorted[0][0]).toFloat() < Scalar(z_sorted[0][1]).toFloat()); } - if(type.backend() != kCUDA) + if(type.backend() != Backend::CUDA) SECTION( "randperm" ) { Tensor b = randperm(15, type); Tensor rv, ri; @@ -277,13 +277,13 @@ static void test(Type & type) { } TEST_CASE( "basic tests CPU", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kFloat)); } TEST_CASE( "basic tests GPU", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); if(at::hasCUDA()) { test(CUDA(kFloat)); diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index b86f58f3deeee..395b49d4be0f5 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -8,7 +8,7 @@ using namespace at; TEST_CASE( "broadcast", "[]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); Type & T = CPU(kFloat); diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 7194c83c0be71..31786e88a0944 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -10,7 +10,7 @@ using namespace at; using namespace at::native; TEST_CASE( "cudnn", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 auto handle = getCudnnHandle(); diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index 1603e3d54b16e..4882929876027 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -13,7 +13,7 @@ using namespace at; TEST_CASE( "dlconvertor", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); INFO( "convert ATen to DLTensor" ); diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 99a21d36d7720..fac85da04aa60 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -179,13 +179,13 @@ void test(Type & T, Type & AccT) { } TEST_CASE( "native test CPU", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } TEST_CASE( "native test CUDA", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); if (at::hasCUDA()) { test(CUDA(kFloat), CUDA(kDouble)); diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 59d8e369772fa..0907c89e09b06 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -264,13 +264,13 @@ void test(Type &T) { } TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kFloat)); } TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); if (at::hasCUDA()) { test(CUDA(kFloat)); diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 2880004555a74..a83f345cc48c3 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -72,15 +72,15 @@ void test_overflow() { TEST_CASE( "scalar test", 
"[]" ) { - manual_seed(123, at::Backend::CPU); - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCPU); + manual_seed(123, at::kCUDA); Scalar what = 257; Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; - Generator & gen = at::globalContext().defaultGenerator(Backend::CPU); + Generator & gen = at::globalContext().defaultGenerator(at::kCPU); REQUIRE_NOTHROW(gen.seed()); auto && C = at::globalContext(); if(at::hasCUDA()) { diff --git a/aten/src/ATen/test/tbb_init_test.cpp b/aten/src/ATen/test/tbb_init_test.cpp index ae8b02acbc4e8..a0f21734fa6e4 100644 --- a/aten/src/ATen/test/tbb_init_test.cpp +++ b/aten/src/ATen/test/tbb_init_test.cpp @@ -23,7 +23,7 @@ void test(int given_num_threads) { } int main() { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(-1); std::thread t1(test, -1); diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 5dbd9676e5de9..552328029ce03 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -13,7 +13,7 @@ using namespace at; TEST_CASE( "parallel", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); set_num_threads(1); Tensor a = rand({1,3}); diff --git a/aten/src/ATen/test/test_seed.h b/aten/src/ATen/test/test_seed.h index 16f9ecb6ed479..980a6eb823ee9 100644 --- a/aten/src/ATen/test/test_seed.h +++ b/aten/src/ATen/test/test_seed.h @@ -2,12 +2,12 @@ #include "ATen/ATen.h" -void manual_seed(uint64_t seed, at::Backend backend) { - if (backend == at::Backend::CPU) { - at::Generator & cpu_gen = at::globalContext().defaultGenerator(at::Backend::CPU); +void manual_seed(uint64_t seed, at::DeviceType backend) { + if (backend == at::kCPU) { + at::Generator & cpu_gen = at::globalContext().defaultGenerator(at::kCPU); cpu_gen.manualSeed(seed); - } else if (backend == at::Backend::CUDA && at::hasCUDA()) { - at::Generator & cuda_gen = at::globalContext().defaultGenerator(at::Backend::CUDA); + } else if (backend == at::kCUDA && at::hasCUDA()) { + at::Generator & cuda_gen = at::globalContext().defaultGenerator(at::kCUDA); cuda_gen.manualSeed(seed); } } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index 7e5dad7e46f6d..d302d736c696d 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -9,7 +9,7 @@ using namespace at; TEST_CASE( "undefined tensor test", "[]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. 
Tensor und; diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index 23d37117cb871..1c45e1d00a058 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -7,7 +7,7 @@ using namespace at; TEST_CASE( "wrapdim test", "[]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); Type & T = CPU(kFloat); diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp index 9dccbb384b17a..1cbc232890b4c 100644 --- a/aten/src/TH/THAllocator.cpp +++ b/aten/src/TH/THAllocator.cpp @@ -22,7 +22,7 @@ struct THDefaultAllocator final : public at::Allocator { at::DataPtr allocate(size_t size) const override { auto* ptr = THAlloc(size); - return {ptr, ptr, &THFree, at::kCPU}; + return {ptr, ptr, &THFree, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { return &THFree; @@ -537,25 +537,25 @@ THRefcountedMapAllocator* THRefcountedMapAllocator::fromDataPtr(const at::DataPt at::DataPtr THMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { auto* context = new THMapAllocator(filename, flags, size); if (actual_size_out) *actual_size_out = context->size(); - return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHMapAllocator, at::DeviceType::CPU}; } at::DataPtr THMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new THMapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size(); - return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHMapAllocator, at::DeviceType::CPU}; } at::DataPtr THRefcountedMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { auto* context = new THRefcountedMapAllocator(filename, flags, size); if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; - return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::DeviceType::CPU}; } at::DataPtr THRefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new THRefcountedMapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; - return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::DeviceType::CPU}; } void* THRefcountedMapAllocator::data() const { diff --git a/aten/src/THC/THCAllocator.cpp b/aten/src/THC/THCAllocator.cpp index c6be2f0afefbb..098ec406110c5 100644 --- a/aten/src/THC/THCAllocator.cpp +++ b/aten/src/THC/THCAllocator.cpp @@ -10,7 +10,7 @@ struct THCudaHostAllocator : public at::Allocator { if (size != 0) { THCudaCheck(cudaMallocHost(&ptr, size)); } - return {ptr, ptr, &THCudaHostDeleter, at::kCPU}; + return {ptr, ptr, &THCudaHostDeleter, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { return &THCudaHostDeleter; @@ -34,7 +34,7 @@ struct THCUVAAllocator : public at::Allocator { if (size != 0) { THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)); } - return {ptr, ptr, &THCUVADeleter, at::kCPU}; + return {ptr, ptr, &THCUVADeleter, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { 
return &THCUVADeleter; @@ -64,5 +64,5 @@ at::DataPtr THCIpcDeleter::makeDataPtr(void* data, int device) { int cur_device; THCudaCheck(cudaGetDevice(&cur_device)); auto* context = new THCIpcDeleter(data, device); - return {data, context, &deleteTHCIpcDeleter, at::Device(at::kCUDA, cur_device)}; + return {data, context, &deleteTHCIpcDeleter, at::Device(at::DeviceType::CUDA, cur_device)}; } diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 9a9de1f5e9b62..44c536e7e5d70 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -2,6 +2,8 @@ #include "THCGeneral.h" #include "THCHalf.h" +#include + float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy) { if (n == 1) { diff --git a/aten/src/THC/THCCachingAllocator.cpp b/aten/src/THC/THCCachingAllocator.cpp index b63e47d86eac7..7882e9a37d546 100644 --- a/aten/src/THC/THCCachingAllocator.cpp +++ b/aten/src/THC/THCCachingAllocator.cpp @@ -510,7 +510,7 @@ struct CudaCachingAllocator : public at::Allocator { if (size != 0) { AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::cuda::getCurrentCUDAStreamOnDevice(device))); } - return {r, r, &CudaCachingDeleter, at::Device(at::kCUDA, device)}; + return {r, r, &CudaCachingDeleter, at::Device(at::DeviceType::CUDA, device)}; } at::DeleterFnPtr raw_deleter() const override { return &CudaCachingDeleter; diff --git a/aten/src/THC/THCCachingHostAllocator.cpp b/aten/src/THC/THCCachingHostAllocator.cpp index 617c6f2f520af..b371ed9873abe 100644 --- a/aten/src/THC/THCCachingHostAllocator.cpp +++ b/aten/src/THC/THCCachingHostAllocator.cpp @@ -269,7 +269,7 @@ struct THCCachingHostAllocator final : public at::Allocator { THAssert(size >= 0); void *ptr; THCudaCheck(allocator.malloc(&ptr, size)); - return {ptr, ptr, &THCCachingHostDeleter, at::kCPU}; + return {ptr, ptr, &THCCachingHostDeleter, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { return &THCCachingHostDeleter; diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 05b76d79f59c1..1b716db430714 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -49,7 +49,7 @@ struct THDefaultDeviceAllocator final : public at::Allocator { if (size != 0) THCudaCheck(cudaMalloc(&p, size)); int device; THCudaCheck(cudaGetDevice(&device)); - return {p, p, &THDefaultDeviceDeleter, at::Device(at::kCUDA, device)}; + return {p, p, &THDefaultDeviceDeleter, at::Device(at::DeviceType::CUDA, device)}; } at::DeleterFnPtr raw_deleter() const override { return &THDefaultDeviceDeleter; diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index f76b39a816048..0fb6fea51f5d5 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -22,7 +22,7 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) if(size == 0) { - self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::kCUDA, device))); + self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); self->set_size(0); } else diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index df3ee5326b7d9..0483ebb05d968 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -6,7 +6,7 @@ namespace caffe2 { REGISTER_CPU_OPERATOR(ATen, ATenOp); template<> at::Backend ATenOp::backend() const { - return at::kCPU; + return at::Backend::CPU; } OPERATOR_SCHEMA(ATen); diff --git a/caffe2/contrib/aten/aten_op_cuda.cc b/caffe2/contrib/aten/aten_op_cuda.cc index d416e700cb186..8e1c6bdd23645 
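The allocator call sites above now tag returned memory with an at::DeviceType-based at::Device rather than a Backend. As a hedged illustration of that pattern (MyCpuAllocator and my_free are hypothetical names, not part of this patch; it simply mirrors the shape of THDefaultAllocator above):

#include <cstdlib>
#include "ATen/Allocator.h"

namespace {

void my_free(void* ptr) {
  std::free(ptr);
}

// The returned DataPtr records the raw pointer, a context pointer, a deleter,
// and the DeviceType-based device the memory lives on.
struct MyCpuAllocator final : public at::Allocator {
  at::DataPtr allocate(size_t size) const override {
    void* ptr = std::malloc(size);
    return {ptr, ptr, &my_free, at::DeviceType::CPU};
  }
  at::DeleterFnPtr raw_deleter() const override {
    return &my_free;
  }
};

} // namespace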
100644 --- a/caffe2/contrib/aten/aten_op_cuda.cc +++ b/caffe2/contrib/aten/aten_op_cuda.cc @@ -6,7 +6,7 @@ namespace caffe2 { REGISTER_CUDA_OPERATOR(ATen, ATenOp); template<> at::Backend ATenOp::backend() const { - return at::kCUDA; + return at::Backend::CUDA; } namespace math { diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 3cfb97292c9a2..4e9f6f2ac280f 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -51,6 +51,10 @@ OnnxifiOp::BuildInitializationList( std::vector* weight_names, std::vector>* weight_shapes) { const std::vector& ws_blobs = ws->Blobs(); + // Since onnxTensorDescriptorV1.name will point into the memory in + // weight_names, we need to prevent weight_names from reallocating by + // reserving enough memory ahead of time + weight_names->reserve(ws_blobs.size()); std::vector descs; for (const auto& s : ws_blobs) { auto it = initialization_list->find(s); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index f5ea0f678ed51..61d5301adb72e 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -11,12 +11,17 @@ using namespace nom; // $$ X_{bn} = \frac{s(X - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ // $$ X_{conv} = X * W + b_{conv} $$ // thus, substituting $X$ with $X_{conv}$ in the BN equation we get: -// $$X_{bn} = X * \frac{sW}{\sqrt{\sigma + \epsilon}} + \frac{s(b_{conv} - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ -// or +// $$X_{bn} = X * \frac{sW}{\sqrt{\sigma + \epsilon}} + \frac{s(b_{conv} - +// m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ or // $$ W' = W\frac{s}{\sqrt{\sigma + \epsilon}}$$ // $$ b' = (b_{conv} - m)\frac{s}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { - for (auto convNode : repr::nn::nodeIterator(nn->dataFlow)) { + size_t convOrder = 0; + for (auto node_pair : repr::nn::dataIterator(nn->dataFlow)) { + repr::NNGraph::NodeRef convNode; + repr::Conv* conv; + std::tie(conv, convNode) = node_pair; + auto output = repr::nn::getOutputs(convNode).front(); auto consumers = repr::nn::getConsumers(output); NOM_REQUIRE_OR_CONT(consumers.size() == 1); @@ -31,9 +36,9 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { auto bnOutput = bnOutputs.front(); auto convInputs = repr::nn::getInputs(convNode); - CAFFE_ENFORCE( - convInputs.size() >= 3, - "Invalid convolution input size (TODO: optional bias)"); + if (convInputs.size() < 2) { + continue; + } auto bnInputs = repr::nn::getInputs(bnNode); CAFFE_ENFORCE( @@ -46,13 +51,46 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { auto name##Data = name##Tensor->mutable_data(); EXPOSE_TENSOR_DATA(filter, 1, convInputs); - EXPOSE_TENSOR_DATA(biasConv, 2, convInputs); EXPOSE_TENSOR_DATA(scale, 1, bnInputs); EXPOSE_TENSOR_DATA(biasBN, 2, bnInputs); EXPOSE_TENSOR_DATA(mean, 3, bnInputs); EXPOSE_TENSOR_DATA(variance, 4, bnInputs); + if (convInputs.size() == 2) { + NOM_REQUIRE_OR_CONT(conv->getMutableAnnotation() != nullptr); + auto annotation = + dyn_cast(conv->getMutableAnnotation()); + NOM_REQUIRE_OR_CONT(annotation != nullptr); + auto op = annotation->getOperatorDef(); + auto convName = op.name(); + + while (true) { + auto convBiasName = convName + "_bias" + to_string(convOrder); + if (!ws->HasBlob(convBiasName)) { + auto convBiasTensor = make_unique(convBiasName); + convBiasTensor->setType(repr::Tensor::DataType::Float); + auto convBiasNode = nn->dataFlow.createNode( + unique_dyn_cast(convBiasTensor)); + nn->inputs.insert(convBiasNode); + 
nn->dataFlow.createEdge(convBiasNode, convNode); + + auto* blob = ws->CreateBlob(convBiasName); + caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + CHECK_NOTNULL(tensor); + // Get output channel + size_t c = filterTensor->dim32(0); + tensor->Resize(c); + tensor->mutable_data(); + break; + } + convOrder++; + } + } + + convInputs = repr::nn::getInputs(convNode); + EXPOSE_TENSOR_DATA(biasConv, 2, convInputs); + #undef EXPOSE_TENSOR_DATA // Assume M{CHW,HWC} diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py index a51251a23a75a..302ee8de817f5 100644 --- a/caffe2/python/dataio.py +++ b/caffe2/python/dataio.py @@ -586,6 +586,11 @@ def schema(self): def setup(self, **kwargs): for reader_builder in self._reader_builders: reader_builder.setup(**kwargs) + # limiter is stateful; it can only be used once. Since + # CompositeReader stops when one of the reader stops, + # this is fine. + if "limiter" in kwargs: + kwargs.pop("limiter") def new_reader(self, **kwargs): readers = [] diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index e2a8ac0c2102d..ee38fe52df8c4 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -55,12 +55,17 @@ def _init_output(output, capacity, global_init_net, global_exit_net): return out_queue, writer -def make_processor(processor): +def make_processor(processor, reader=None): if processor is None: return lambda rec: rec elif isinstance(processor, core.Net): return NetProcessor(processor) else: + if reader is not None and hasattr(processor, "schema_func"): + def processor_schema(): + return processor.schema_func(reader) + + processor.schema = processor_schema return processor @@ -352,7 +357,10 @@ class ProcessingReader(Reader): def __init__(self, reader, processor): Reader.__init__(self) self.reader = reader - self.processor = make_processor(processor) + self.processor = make_processor(processor, reader) + + def schema(self): + return self.processor.schema() def setup_ex(self, init_net, finish_net): self.reader.setup_ex(init_net, finish_net) @@ -404,6 +412,9 @@ def __init__(self, net, stop_signal=None, thread_init_nets=None, name=None): self._frozen = False self._cloned_init_nets = [] + def schema(self): + return self.net.output_record() + def setup(self, init_net): self._frozen = True cloned_init_nets = self._cloned_init_nets diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 4e215b586e5f3..d9992116a696e 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -221,6 +221,116 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon assert np.allclose( preTransformOutput, postTransformOutput, - rtol=1e-02, + rtol=5e-02, + atol=1e-03 + ) + + @given( + size=st.integers(7, 10), + input_channels=st.integers(1, 10), + seed=st.integers(0, 65535), + order=st.sampled_from(["NCHW", "NHWC"]), + epsilon=st.floats(min_value=1e-5, max_value=1e-2), + ) + def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, order, epsilon): + workspace.ResetWorkspace() + net = core.Net("net") + c = input_channels + h = size + w = size + k = 3 + net.Conv(["X", "w"], ["Y"], stride=1, pad=0, kernel=k, order=order) + net.SpatialBN( + ["Y", "scale", "bias", "mean", "var"], + ["Y2"], + is_test=True, + order=order, + epsilon=epsilon, + ) + + np.random.seed(seed) + if order == "NCHW": + workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, c, k, 
k).astype(np.float32)) + else: + workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) + workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + # This is necessary because 1/sqrt(var) is used and if var is too small + # we get floating point artifacts that cause test failures + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + workspace.RunNetOnce(net) + preTransformOutput = workspace.FetchBlob("Y2").flatten() + workspace.FeedBlob("Y2", np.zeros((1, 1))) + transformer.FuseConvBN(net) + + # Ensure fusion + assert len(net.Proto().op) == 1 + workspace.RunNetOnce(net) + postTransformOutput = workspace.FetchBlob("Y2").flatten() + # Check that there is no numerical difference + assert np.allclose( + preTransformOutput, + postTransformOutput, + rtol=5e-02, + atol=1e-03 + ) + + @given( + size=st.integers(7, 10), + input_channels=st.integers(1, 10), + seed=st.integers(0, 65535), + order=st.sampled_from(["NCHW", "NHWC"]), + epsilon=st.floats(min_value=1e-5, max_value=1e-2), + ) + def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channels, seed, order, epsilon): + workspace.ResetWorkspace() + net = core.Net("net") + c = input_channels + h = size + w = size + k = 3 + net.Conv(["X", "w"], ["Y"], stride=1, pad=0, kernel=k, order=order) + net.SpatialBN( + ["Y", "scale", "_bias0", "mean", "var"], + ["Y2"], + is_test=True, + order=order, + epsilon=epsilon, + ) + + np.random.seed(seed) + if order == "NCHW": + workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + else: + workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) + workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("_bias0", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + # This is necessary because 1/sqrt(var) is used and if var is too small + # we get floating point artifacts that cause test failures + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + workspace.RunNetOnce(net) + preTransformOutput = workspace.FetchBlob("Y2").flatten() + workspace.FeedBlob("Y2", np.zeros((1, 1))) + transformer.FuseConvBN(net) + + # Ensure fusion + assert len(net.Proto().op) == 1 + workspace.RunNetOnce(net) + postTransformOutput = workspace.FetchBlob("Y2").flatten() + print("pre") + print(preTransformOutput) + print("after") + print(postTransformOutput) + # Check that there is no numerical difference + assert np.allclose( + preTransformOutput, + postTransformOutput, + rtol=5e-02, atol=1e-03 ) diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile new file mode 100644 index 0000000000000..acb9e35fc2bb0 --- /dev/null +++ b/docs/cpp/Doxyfile @@ -0,0 +1,2468 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] 
+# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "PyTorch" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = build + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = "rst=\verbatim embed:rst:leading-asterisk" +ALIASES += "endrst=\endverbatim" + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". 
For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. 
+ +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. 
By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. 
+ +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. 
If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. 
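+# A note on the paths below: as with EXCLUDE further down, relative paths are
+# resolved from the directory in which doxygen is run, and the ../../ prefixes
+# assume that directory sits two levels below the repository root (for
+# instance a docs/cpp/ directory; the exact location and the config file name
+# used here are assumptions). A minimal sketch of a build under that
+# assumption:
+#
+#   cd docs/cpp && doxygen Doxyfile
+#
+# The ../../build/aten/... headers listed last appear to be generated files, so
+# an ordinary PyTorch build would have to run before doxygen can find them.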
+ +INPUT = ../../torch/csrc/api/include \ + ../../torch/csrc/api/src \ + ../../aten/src/ATen/ATen.h \ + ../../aten/src/ATen/Backend.h \ + ../../aten/src/ATen/Context.h \ + ../../aten/src/ATen/Device.h \ + ../../aten/src/ATen/DeviceGuard.h \ + ../../aten/src/ATen/Layout.h \ + ../../aten/src/ATen/OptionsGuard.h \ + ../../aten/src/ATen/Scalar.h \ + ../../aten/src/ATen/TensorOptions.h \ + ../../aten/src/ATen/core/ArrayRef.h \ + ../../aten/src/ATen/core/DeviceType.h \ + ../../aten/src/ATen/core/Error.h \ + ../../aten/src/ATen/core/Half.h \ + ../../aten/src/ATen/core/ScalarType.h \ + ../../aten/src/ATen/cuda/CUDAGuard.h \ + ../../aten/src/ATen/cuda/CUDAStream.h \ + ../../aten/src/ATen/cuda/CUDAHalf.h \ + ../../aten/src/ATen/cuda/CUDAContext.h \ + ../../aten/src/ATen/cuda/PinnedMemoryAllocator.h \ + ../../aten/src/ATen/cudnn/Descriptors.h \ + ../../aten/src/ATen/cudnn/Handles.h \ + ../../aten/src/ATen/cudnn/Types.h \ + ../../aten/src/ATen/cudnn/Utils.h \ + ../../aten/src/ATen/mkl/Descriptors.h \ + ../../build/aten/src/ATen/Tensor.h \ + ../../build/aten/src/ATen/Functions.h \ + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.h *.cpp + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. 
The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+ +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = NO + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. 
See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. 
The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. 
This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. 
+# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. 
+ +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/