diff --git a/.gitignore b/.gitignore index 329e999cf4562..4ef31ba826860 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,9 @@ aten/src/ATen/cuda/CUDAConfig.h build/ dist/ docs/src/**/* +docs/cpp/xml/ +docs/cpp/html/ +docs/cpp/api/ test/.coverage test/cpp/api/mnist test/data/gpu_tensors.pt diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index 8493ccd56ba01..fd73f8d6806f9 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -23,7 +23,7 @@ class DataPtr { public: // Choice of CPU here is arbitrary; if there's an "undefined" device // we could use that too - DataPtr() : ptr_(), device_(kCPU) {} + DataPtr() : ptr_(), device_(DeviceType::CPU) {} DataPtr(void* data, Device device) : ptr_(data), device_(device) {} DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) diff --git a/aten/src/ATen/Backend.h b/aten/src/ATen/Backend.h index 40db1ee67f247..622d181c75df6 100644 --- a/aten/src/ATen/Backend.h +++ b/aten/src/ATen/Backend.h @@ -3,18 +3,25 @@ #include #include #include +#include #include namespace at { +/** + * This legacy enum class defines the set of backends supported by + * old school, code generated Type-based ATen. The reason we are + * sunsetting this enum class is because it doesn't allow for + * open registration of backends. TensorTypeId is the replacement + * for Backend which supports open registration. + * + * ARE YOU SURE YOU WANT TO USE THIS TYPE? Think about if SparseCPU/SparseCUDA + * would make sense in your use case. If it doesn't make sense, maybe + * you want DeviceType. + */ enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA, Undefined, NumOptions }; -constexpr Backend kCPU = Backend::CPU; -constexpr Backend kCUDA = Backend::CUDA; -constexpr Backend kSparseCPU = Backend::SparseCPU; -constexpr Backend kSparseCUDA = Backend::SparseCUDA; - static inline Backend toSparse(Backend b) { switch (b) { case Backend::CPU: @@ -78,6 +85,71 @@ static inline TensorTypeId backendToTensorTypeId(Backend b) { } } +static inline DeviceType backendToDeviceType(Backend b) { + switch (b) { + case Backend::CPU: + return DeviceType::CPU; + case Backend::CUDA: + return DeviceType::CUDA; + case Backend::SparseCPU: + return DeviceType::CPU; + case Backend::SparseCUDA: + return DeviceType::CUDA; + case Backend::Undefined: + AT_ERROR("Undefined backend is not a valid device type"); + default: + AT_ERROR("Unknown backend"); + } +} + +static inline Backend deviceTypeToBackend(DeviceType d) { + switch (d) { + case DeviceType::CPU: + return Backend::CPU; + case DeviceType::CUDA: + return Backend::CUDA; + default: + AT_ERROR("Unknown device type ", d); + } +} + +static inline Backend backendToCPU(Backend b) { + switch (b) { + case Backend::CPU: + return Backend::CPU; + case Backend::CUDA: + return Backend::CPU; + case Backend::SparseCPU: + return Backend::SparseCPU; + case Backend::SparseCUDA: + return Backend::SparseCPU; + case Backend::Undefined: + return Backend::Undefined; + default: + AT_ERROR("Unknown backend"); + } +} + +static inline Backend backendToCUDA(Backend b) { + switch (b) { + case Backend::CPU: + return Backend::CUDA; + case Backend::CUDA: + return Backend::CUDA; + case Backend::SparseCPU: + return Backend::SparseCUDA; + case Backend::SparseCUDA: + return Backend::SparseCUDA; + case Backend::Undefined: + return Backend::Undefined; + default: + AT_ERROR("Unknown backend"); + } +} + +constexpr DeviceType kCPU = DeviceType::CPU; +constexpr DeviceType kCUDA = DeviceType::CUDA; + static inline const char* toString(Backend b) { 
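// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a minimal stand-in for the new
// Backend <-> DeviceType helpers introduced above, showing that both the dense
// and the sparse backend of a device collapse onto the same DeviceType, while
// the reverse mapping always picks the dense backend. The enum values are
// simplified stand-ins for the ATen ones.
#include <stdexcept>

namespace sketch {
enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA, Undefined };
enum class DeviceType { CPU, CUDA };

inline DeviceType backendToDeviceType(Backend b) {
  switch (b) {
    case Backend::CPU:
    case Backend::SparseCPU:
      return DeviceType::CPU;
    case Backend::CUDA:
    case Backend::SparseCUDA:
      return DeviceType::CUDA;
    default:
      throw std::runtime_error("Undefined backend has no device type");
  }
}

inline Backend deviceTypeToBackend(DeviceType d) {
  // Lossy by design: sparse-ness is a property of the Backend, not the device.
  return d == DeviceType::CPU ? Backend::CPU : Backend::CUDA;
}
} // namespace sketch
// ---------------------------------------------------------------------------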
switch (b) { case Backend::CPU: diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 0c0e99b90906c..f85996f74c4b7 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -32,7 +32,7 @@ Context::Context() THSetDefaultErrorHandler(errorHandler,nullptr); THSetDefaultArgErrorHandler(argErrorHandler,nullptr); - generator_registry[static_cast(Backend::CPU)] + generator_registry[static_cast(DeviceType::CPU)] .reset(new CPUGenerator(this)); Type::registerCPU(this); } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 3171a11ada8e3..6cbc7d0d7961b 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -25,7 +25,7 @@ class AT_API Context { return type_registry[static_cast(p)][static_cast(s)].get(); } Type * getTypeOpt(Backend p, ScalarType s) { - initCUDAIfNeeded(p); + if (p != Backend::Undefined) initCUDAIfNeeded(backendToDeviceType(p)); auto type = getTypeRaw(p, s); if(!type) { @@ -42,11 +42,11 @@ class AT_API Context { if (!type) AT_ERROR(toString(p), toString(s), "Type is not enabled."); return *type; } - Generator & defaultGenerator(Backend p) { - initCUDAIfNeeded(p); - auto & generator = generator_registry[static_cast(p)]; + Generator & defaultGenerator(DeviceType device_type) { + initCUDAIfNeeded(device_type); + auto & generator = generator_registry[static_cast(device_type)]; if(!generator) - AT_ERROR(toString(p), " backend type not enabled."); + AT_ERROR(DeviceTypeName(device_type), " backend type not enabled."); return *generator; } bool hasMKL() const; @@ -64,7 +64,7 @@ class AT_API Context { THCState* lazyInitCUDA() { std::call_once(thc_init,[&] { thc_state = detail::getCUDAHooks().initCUDA(); - generator_registry[static_cast(Backend::CUDA)] = + generator_registry[static_cast(DeviceType::CUDA)] = detail::getCUDAHooks().initCUDAGenerator(this); detail::getCUDAHooks().registerCUDATypes(this); }); @@ -95,16 +95,17 @@ class AT_API Context { bool deterministicCuDNN() const; void setDeterministicCuDNN(bool); std::unique_ptr - generator_registry[static_cast(Backend::NumOptions)]; + generator_registry[static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)]; private: // NB: type_registry has nullptr for all CUDA backends until // CUDA initialization has occurred std::unique_ptr type_registry [static_cast(Backend::NumOptions)] [static_cast(ScalarType::NumOptions)]; - void initCUDAIfNeeded(Backend p) { - if(p == Backend::CUDA) + void initCUDAIfNeeded(DeviceType p) { + if (p == DeviceType::CUDA) { lazyInitCUDA(); + } } std::once_flag thc_init; bool enabled_cudnn = true; @@ -132,6 +133,10 @@ static inline Type& getType(Backend p, ScalarType s) { return globalContext().getType(p, s); } +static inline Type& getType(DeviceType p, ScalarType s) { + return globalContext().getType(deviceTypeToBackend(p), s); +} + static inline Type& CPU(ScalarType s) { return getType(Backend::CPU, s); } diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index 8dbd9ffab2fd2..d48984a2063dc 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -1,9 +1,10 @@ #pragma once -#include +#include #include #include #include +#include #include #include @@ -24,21 +25,6 @@ namespace at { struct Device { using Type = at::DeviceType; - /// Converts a `Backend` to a `DeviceType` if possible. 
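// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the lazy, one-time CUDA setup
// that Context::lazyInitCUDA() above relies on. std::call_once guarantees the
// initializer runs exactly once even with concurrent callers; acquire_state is
// a stand-in for detail::getCUDAHooks().initCUDA().
#include <mutex>

namespace sketch {
struct Context {
  void* lazyInitBackendState() {
    std::call_once(init_flag_, [&] { state_ = acquire_state(); });
    return state_;
  }

 private:
  static void* acquire_state() { return nullptr; }  // placeholder resource
  std::once_flag init_flag_;
  void* state_ = nullptr;
};
} // namespace sketch
// ---------------------------------------------------------------------------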
- static DeviceType backend_to_type(Backend backend) { - switch (backend) { - case kCPU: - case kSparseCPU: - return DeviceType::CPU; - case kCUDA: - case kSparseCUDA: - return DeviceType::CUDA; - default: - AT_ERROR( - "Invalid backend ", toString(backend), " for Device construction"); - } - } - /// Constructs a new `Device` from a `DeviceType` and an optional device /// index. /* implicit */ Device(DeviceType type, int32_t index = -1) @@ -60,11 +46,6 @@ struct Device { /// `` optionally specifies a device index. /* implicit */ Device(const std::string& device_string); - /// Constructs a new `Device` from a `Backend` (which is converted to a - /// `DeviceType`, if possible) and an optional device index. - /* implicit */ Device(Backend backend, int32_t index = -1) - : Device(backend_to_type(backend), index) {} - /// Returns true if the type and index of this `Device` matches that of /// `other`. bool operator==(const Device& other) const noexcept { diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index 1dd6b71c69386..459e7e58bdb38 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -250,7 +250,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi stream << "size:\n" << tensor_.sizes() << "\n"; stream << "]"; } else { - Type& cpudouble = tensor_.type().toBackend(kCPU).toScalarType(kDouble); + Type& cpudouble = tensor_.type().toBackend(Backend::CPU).toScalarType(kDouble); Tensor tensor = tensor_.toType(cpudouble).contiguous(); if(tensor.ndimension() == 0) { stream << defaultfloat << tensor.data()[0] << std::endl; diff --git a/aten/src/ATen/TensorOptions.h b/aten/src/ATen/TensorOptions.h index 20b0d1ed71d78..350bd449a31e9 100644 --- a/aten/src/ATen/TensorOptions.h +++ b/aten/src/ATen/TensorOptions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -67,7 +68,7 @@ struct AT_API TensorOptions { type_ = &type; } this->dtype(type.scalarType()); - this->device({type.backend(), device_index}); + this->device({backendToDeviceType(type.backend()), device_index}); this->layout(type.layout()); } @@ -84,7 +85,12 @@ struct AT_API TensorOptions { /// Constructs a `TensorOptions` object from a backend, forwarded to the /// `Device` constructor. /* implicit */ TensorOptions(Backend backend) - : TensorOptions(Device(backend)) {} + : TensorOptions(Device(backendToDeviceType(backend))) {} + + /// Constructs a `TensorOptions` object from a device type, forwarded to the + /// `Device` constructor. + /* implicit */ TensorOptions(DeviceType device_type) + : TensorOptions(Device(device_type)) {} /// Constructs a `TensorOptions` object with the given dtype. /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() { @@ -190,9 +196,9 @@ struct AT_API TensorOptions { Backend backend() const noexcept { Backend backend; if (device_.type() == Device::Type::CPU) { - backend = (layout_ == kStrided) ? kCPU : kSparseCPU; + backend = (layout_ == kStrided) ? Backend::CPU : Backend::SparseCPU; } else { - backend = (layout_ == kStrided) ? kCUDA : kSparseCUDA; + backend = (layout_ == kStrided) ? Backend::CUDA : Backend::SparseCUDA; } return backend; } diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 3cc64320815af..87a3e5b78ce28 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -7,30 +7,22 @@ // static library (in which case, saying the symbol is coming // from a DLL would be incorrect). 
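// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how TensorOptions::backend()
// above now derives the legacy Backend from the (DeviceType, Layout) pair that
// TensorOptions actually stores. Layout::Strided stands in for at::kStrided.
namespace sketch {
enum class DeviceType { CPU, CUDA };
enum class Layout { Strided, Sparse };
enum class Backend { CPU, CUDA, SparseCPU, SparseCUDA };

inline Backend toBackend(DeviceType device, Layout layout) {
  if (device == DeviceType::CPU) {
    return layout == Layout::Strided ? Backend::CPU : Backend::SparseCPU;
  }
  return layout == Layout::Strided ? Backend::CUDA : Backend::SparseCUDA;
}
} // namespace sketch
// ---------------------------------------------------------------------------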
-#define AT_CORE_EXPORT -#define AT_CORE_IMPORT - #ifdef _WIN32 - #ifndef AT_CORE_STATIC_WINDOWS - #undef AT_CORE_EXPORT - #undef AT_CORE_IMPORT - #define AT_CORE_EXPORT __declspec(dllexport) - #define AT_CORE_IMPORT __declspec(dllimport) - #endif // !defined(AT_CORE_STATIC_WINDOWS) -#else // _WIN32 - #if defined(__GNUC__) || defined(__llvm__) - #undef AT_CORE_EXPORT - #undef AT_CORE_IMPORT - #define AT_CORE_EXPORT __attribute__((__visibility__("default"))) - #define AT_CORE_IMPORT AT_CORE_EXPORT - #endif // defined(__GNUC__) || defined(__llvm__) -#endif // _WIN32 - +#if !defined(AT_CORE_STATIC_WINDOWS) +// TODO: unfiy the controlling macros. #if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) - #define AT_CORE_API AT_CORE_EXPORT +#define AT_CORE_API __declspec(dllexport) #else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) - #define AT_CORE_API AT_CORE_IMPORT +#define AT_CORE_API __declspec(dllimport) #endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#else // !defined(AT_CORE_STATIC_WINDOWS) +#define AT_CORE_API +#endif // !defined(AT_CORE_STATIC_WINDOWS) +#else // _WIN32 +#if defined(__GNUC__) +#define AT_CORE_API __attribute__((__visibility__("default"))) +#endif // defined(__GNUC__) +#endif // _WIN32 // Disable the copy and assignment operator for a class. Note that this will // disable the usage of the class in std containers. diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index 7b3cdebd0263b..7fbaf04c3e759 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -391,14 +391,33 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * * NOTE: the macro needs to be invoked in ::caffe2 namespace */ - +// Implementation note: in MSVC, we will need to prepend the AT_CORE_API +// keyword in order to get things compiled properly. in Linux, gcc seems to +// create attribute ignored error for explicit template instantiations, see +// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930 +// and as a result, we define these two macros slightly differently. +// TODO(jiayq): AT_CORE_API below is not correct, because we may use the +// definition in third party dependent libraries. The proper way is to use +// CAFFE2_EXPORT (which explicitly requires dllexport). Marking this as a +// todo item when the unified build is finished. +#ifdef _MSC_VER #define CAFFE_KNOWN_TYPE(T) \ template <> \ - AT_CORE_EXPORT TypeIdentifier TypeMeta::Id() { \ + AT_CORE_API TypeIdentifier TypeMeta::Id() { \ static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ static TypeNameRegisterer registerer(type_id, #T); \ return type_id; \ } +#else // _MSC_VER +#define CAFFE_KNOWN_TYPE(T) \ + template <> \ + TypeIdentifier TypeMeta::Id() { \ + static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ + static TypeNameRegisterer registerer(type_id, #T); \ + return type_id; \ + } +#endif /** * CAFFE_DECLARE_KNOWN_TYPE and CAFFE_DEFINE_KNOWN_TYPE are used @@ -406,11 +425,19 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * can be resolved at compile time. Please use CAFFE_KNOWN_TYPE() instead * for your own types to allocate dynamic ids for them. 
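// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the registration idiom behind
// CAFFE_KNOWN_TYPE above, with TypeId and the name registry as simplified
// stand-ins for TypeIdentifier/TypeNameRegisterer. A function-local static
// hands each type a unique id on first use; a second static registers a
// printable name exactly once.
#include <atomic>
#include <map>
#include <string>
#include <typeinfo>

namespace sketch {
using TypeId = int;

inline std::map<TypeId, std::string>& type_name_registry() {
  static std::map<TypeId, std::string> registry;
  return registry;
}

inline TypeId create_type_id() {
  static std::atomic<TypeId> counter{0};
  return counter++;
}

template <typename T>
TypeId Id() {
  static const TypeId type_id = create_type_id();
  static const bool registered =
      (type_name_registry().emplace(type_id, typeid(T).name()), true);
  (void)registered;
  return type_id;
}
} // namespace sketch
// ---------------------------------------------------------------------------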
*/ -#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ - template <> \ - AT_CORE_EXPORT inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ - return TypeIdentifier(PreallocatedId); \ +#ifdef _MSC_VER +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ } +#else // _MSC_VER +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ + } +#endif #define CONCAT_IMPL(x, y) x##y #define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y) diff --git a/aten/src/ATen/cuda/CUDAEvent.cpp b/aten/src/ATen/cuda/CUDAEvent.cpp new file mode 100644 index 0000000000000..ab6c8421816ce --- /dev/null +++ b/aten/src/ATen/cuda/CUDAEvent.cpp @@ -0,0 +1,66 @@ +#include "ATen/cuda/CUDAEvent.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAStream.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/core/Error.h" + +#include +#include + +// Internal implementation is entirely hidden +struct CUDAEventInternals { + std::atomic refcount; + int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t + cudaEvent_t event; +}; + +namespace at { +namespace cuda { + +namespace detail { + +/* +* Pointer-based event API +*/ +CUDAEventInternals* CUDAEvent_create(unsigned int flags) { + std::unique_ptr internals { new CUDAEventInternals() }; + internals->refcount = 1; + internals->device = current_device(); + AT_CUDA_CHECK(cudaEventCreateWithFlags(&internals->event, flags)); + return internals.release(); +} + +void CUDAEvent_retain(CUDAEventInternals* internals) { + internals->refcount++; +} + +void CUDAEvent_uncheckedFree(CUDAEventInternals* internals) { + if (--internals->refcount) { + cudaEventDestroy(internals->event); + } +} +cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals) { + return internals->event; +} + +int64_t CUDAEvent_device(CUDAEventInternals* internals) { + return internals->device; +} + +void CUDAEvent_record(CUDAEventInternals* internals, const CUDAStream& stream) { + AT_CUDA_CHECK(cudaEventRecord(internals->event, stream)); +} + +} // namespace detail + +void CUDAEvent::record() const { + record(getCurrentCUDAStream()); +} + +void CUDAEvent::record(const CUDAStream& stream) const { + detail::CUDAEvent_record(internals_, stream); +} + + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h new file mode 100644 index 0000000000000..79abfd3dcc01a --- /dev/null +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include + +#include "cuda_runtime_api.h" + +#include +#include + +/* +* A CUDA event interface with no CUDA build dependency. +* +* Includes the CUDAEvent RAII class and a pointer-based event API. 
+*/ + +struct CUDAEventInternals; + +namespace at { +namespace cuda { + +struct CUDAStream; + +namespace detail { + +// Pointer-based API (for internal use) +// Note: ATen/Context is preferred to work with streams safely +AT_API CUDAEventInternals* CUDAEvent_create(unsigned int flags); +AT_API void CUDAEvent_retain(CUDAEventInternals* internals); +AT_API void CUDAEvent_uncheckedFree(CUDAEventInternals* internals); +AT_API cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals); +AT_API int64_t CUDAEvent_device(CUDAEventInternals* internals); + +} // namespace detail + +struct CUDAEvent { + // Constants + static constexpr unsigned int DEFAULT_FLAGS = cudaEventDisableTiming; + + // Constructors + CUDAEvent(unsigned int flags = DEFAULT_FLAGS) + : internals_(detail::CUDAEvent_create(flags)) {} + + ~CUDAEvent() { detail::CUDAEvent_uncheckedFree(internals_); } + + CUDAEvent(const CUDAEvent& other) { + detail::CUDAEvent_retain(other.internals_); + internals_ = other.internals_; + } + + CUDAEvent(CUDAEvent&& other) { + std::swap(internals_, other.internals_); + } + + CUDAEvent& operator=(CUDAEvent other) noexcept { + std::swap(internals_, other.internals_); + return *this; + } + + operator cudaEvent_t() const { return detail::CUDAEvent_event(internals_); } + + // Less than operator (to allow use in sets) + friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) { + return left.internals_ < right.internals_; + } + + int64_t device() const { return detail::CUDAEvent_device(internals_); } + cudaEvent_t event() const { return detail::CUDAEvent_event(internals_); } + CUDAEventInternals* internals() const { return internals_; } + + void record() const; // Record on the current stream + void record(const CUDAStream& stream) const; + +private: + CUDAEventInternals* internals_; +}; + +} // namespace cuda +} // namespace at + diff --git a/aten/src/ATen/cuda/CUDAStream.cpp b/aten/src/ATen/cuda/CUDAStream.cpp index 2dab634bc71e9..12d571da7f459 100644 --- a/aten/src/ATen/cuda/CUDAStream.cpp +++ b/aten/src/ATen/cuda/CUDAStream.cpp @@ -1,5 +1,6 @@ #include "ATen/cuda/CUDAStream.h" #include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAEvent.h" #include "ATen/cuda/Exceptions.h" #include "ATen/core/Error.h" @@ -173,6 +174,10 @@ namespace detail { } } + void CUDAStream_synchronize_with(CUDAStreamInternals* ptr, const CUDAEvent& event) { + AT_CUDA_CHECK(cudaStreamWaitEvent(ptr->stream, event, 0)); + } + } // namespace detail /* @@ -194,5 +199,9 @@ namespace detail { std::swap(internals_, other.internals_); } + void CUDAStream::synchronize_with(const CUDAEvent& event) const { + detail::CUDAStream_synchronize_with(internals_, event); + } + } // namespace cuda } // namespace at diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h index 545bccfdfbcb7..7a3e1e0595c12 100644 --- a/aten/src/ATen/cuda/CUDAStream.h +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -15,12 +15,13 @@ * The ATen Context interface should be preferred when working with streams. 
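// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the ownership pattern CUDAEvent
// uses above -- heap-allocated, refcounted internals; copy retains; destruction
// releases; copy/move assignment implemented as swap on a by-value parameter.
// Handle/Internals are stand-in names and no CUDA calls are made here.
#include <atomic>
#include <utility>

namespace sketch {
struct Internals {
  std::atomic<int> refcount{1};
};

struct Handle {
  Handle() : internals_(new Internals()) {}
  ~Handle() {
    if (internals_ && --internals_->refcount == 0) delete internals_;
  }
  Handle(const Handle& other) : internals_(other.internals_) {
    internals_->refcount++;
  }
  Handle(Handle&& other) noexcept : internals_(nullptr) {
    std::swap(internals_, other.internals_);
  }
  Handle& operator=(Handle other) noexcept {  // by-value: handles copy and move
    std::swap(internals_, other.internals_);
    return *this;
  }

 private:
  Internals* internals_;
};
} // namespace sketch
// ---------------------------------------------------------------------------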
*/ -// Forward-declares internals struct CUDAStreamInternals; namespace at { namespace cuda { +struct CUDAEvent; + namespace detail { // Pointer-based API (for internal use) @@ -102,6 +103,8 @@ struct CUDAStream { cudaStream_t stream() const { return detail::CUDAStream_stream(internals_); } CUDAStreamInternals* internals() const { return internals_; } + void synchronize_with(const CUDAEvent& event) const; + private: CUDAStreamInternals* internals_ = nullptr; }; diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index fc61ccd698e26..9f589017822bc 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -293,7 +293,7 @@ def __init__(self, reason): 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': CodeTemplate( - 'check_generator<${Backend}Generator>(${arg_name}, &globalContext().defaultGenerator(backend()))'), + 'check_generator<${Backend}Generator>(${arg_name}, &globalContext().defaultGenerator(device_type()))'), # This is a cast done via direct-construction 'IntListStride': CodeTemplate('at::IntList ${result_name} = get_intlist_stride_th(${arg_name});'), 'real': CodeTemplate('${arg_name}.to${ScalarName}()'), diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 328cdb88e951c..9292994341130 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -28,7 +28,7 @@ Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar AT_ERROR("add(sparse, dense) is not supported. Use add(dense, sparse) instead."); } auto iter = TensorIterator::binary_op(result, self, other); - add_stub(iter->backend(), *iter, alpha); + add_stub(iter->device_type(), *iter, alpha); return result; } @@ -53,7 +53,7 @@ Tensor& div_out(Tensor& result, const Tensor& self, const Tensor& other) { return at::_sparse_div_out(result, self, Scalar(other)); } auto iter = TensorIterator::binary_op(result, self, other); - div_stub(iter->backend(), *iter); + div_stub(iter->device_type(), *iter); return result; } @@ -74,7 +74,7 @@ Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) { return at::_sparse_mul_out(result, self, other); } auto iter = TensorIterator::binary_op(result, self, other); - mul_stub(iter->backend(), *iter); + mul_stub(iter->device_type(), *iter); return result; } @@ -105,7 +105,7 @@ Tensor& sub_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar AT_ERROR("sub(sparse, dense) is not supported. Use sub(dense, sparse) instead."); } auto iter = TensorIterator::binary_op(result, self, other); - sub_stub(iter->backend(), *iter, alpha); + sub_stub(iter->device_type(), *iter, alpha); return result; } diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 4d4d3df1bd35e..dad05dcf8b47a 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -50,17 +50,17 @@ struct AT_API DispatchStub { static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); template - void operator()(Backend backend, ArgTypes... args) { - if (backend == Backend::CPU) { + void operator()(DeviceType device_type, ArgTypes&&... 
args) { + if (device_type == DeviceType::CPU) { if (!cpu_dispatch_ptr) { cpu_dispatch_ptr = choose_cpu_impl(); } - (*cpu_dispatch_ptr)(args...); - } else if (backend == Backend::CUDA) { + (*cpu_dispatch_ptr)(std::forward(args)...); + } else if (device_type == DeviceType::CUDA) { AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(args...); + (*cuda_dispatch_ptr)(std::forward(args)...); } else { - AT_ERROR("DispatchStub: unsupported backend", backend); + AT_ERROR("DispatchStub: unsupported device type", device_type); } } @@ -109,12 +109,33 @@ struct RegisterDispatch { #define DEFINE_DISPATCH(name) struct name name -#if defined(__CUDACC__) -#define REGISTER_DISPATCH(name, fn) \ +#define REGISTER_ARCH_DISPATCH(name, arch, fn) \ + template <> decltype(fn) DispatchStub::arch = fn; + +#ifdef HAVE_AVX_CPU_DEFINITION +#define REGISTER_AVX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX, fn) +#else +#define REGISTER_AVX_DISPATCH(name, fn) +#endif + +#ifdef HAVE_AVX2_CPU_DEFINITION +#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn) +#else +#define REGISTER_AVX2_DISPATCH(name, fn) +#endif + +#define REGISTER_NO_CPU_DISPATCH(name, fn_type) \ + REGISTER_ARCH_DISPATCH(name, DEFAULT, static_cast(nullptr)) \ + REGISTER_AVX_DISPATCH(name, static_cast(nullptr)) \ + REGISTER_AVX2_DISPATCH(name, static_cast(nullptr)) + +#define REGISTER_CUDA_DISPATCH(name, fn) \ static RegisterDispatch name ## __register(name, fn); + +#if defined(__CUDACC__) +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) #elif defined(CPU_CAPABILITY) -#define REGISTER_DISPATCH(name, fn) \ - template <> decltype(fn) DispatchStub::CPU_CAPABILITY = fn; +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) #endif diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index fb1cd8cf2852f..acb9e220b967b 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -49,7 +49,7 @@ namespace { */ THGenerator* get_generator(at::Generator* gen) { - auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto default_gen = &at::globalContext().defaultGenerator(at::kCPU); auto gen_ = at::check_generator(gen, default_gen); return gen_->generator; } diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 8450cba142d52..efaa4a4b6f507 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -81,7 +81,7 @@ ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) } // anomymous namepsace Tensor dropout(const Tensor& input, double p, bool train) { - if (is_fused_kernel_acceptable(input, p)) { + if (train && is_fused_kernel_acceptable(input, p)) { return std::get<0>(input._fused_dropout(1 - p)); } return _dropout(input, p, train); diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 0026a9907d7ec..f304c6798d11c 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -67,7 +67,7 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; auto& dense_type = grad.type(); - auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? kSparseCUDA : kSparseCPU); + auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? 
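// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the dispatch-on-DeviceType
// pattern DispatchStub implements above, reduced to one function pointer per
// device and perfect forwarding of the call arguments. The per-arch CPU
// selection (DEFAULT/AVX/AVX2) from the patch is collapsed into a single
// pointer here; fill_stub and the kernel names are stand-ins.
#include <stdexcept>
#include <utility>

namespace sketch {
enum class DeviceType { CPU, CUDA };

template <typename FnPtr>
struct DispatchStub {
  template <typename... ArgTypes>
  void operator()(DeviceType device_type, ArgTypes&&... args) {
    if (device_type == DeviceType::CPU) {
      if (!cpu_fn) throw std::runtime_error("DispatchStub: missing CPU kernel");
      (*cpu_fn)(std::forward<ArgTypes>(args)...);
    } else if (device_type == DeviceType::CUDA) {
      if (!cuda_fn) throw std::runtime_error("DispatchStub: missing CUDA kernel");
      (*cuda_fn)(std::forward<ArgTypes>(args)...);
    } else {
      throw std::runtime_error("DispatchStub: unsupported device type");
    }
  }
  FnPtr cpu_fn = nullptr;
  FnPtr cuda_fn = nullptr;
};

// Usage: a kernel file fills in the pointer for its device, call sites dispatch
// on a DeviceType, e.g.
//   DispatchStub<void (*)(float*, int)> fill_stub;
//   fill_stub.cpu_fn = &fill_kernel_cpu;
//   fill_stub(DeviceType::CPU, data, n);
} // namespace sketch
// ---------------------------------------------------------------------------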
Backend::SparseCUDA : Backend::SparseCPU); // check if all our grad come from padding_idx if (grad.numel() == 0) { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 388d704a834d4..d6e5ab586cc49 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -89,7 +89,7 @@ Tensor inverse(const Tensor& self) { } Tensor& inverse_out(Tensor &result, const Tensor &self) { - AT_CHECK(self.type().backend() == kCPU || self.type().backend() == kCUDA, + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "tensor should have CPU or CUDA backend"); AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); AT_CHECK(self.size(0) == self.size(1), "tensor should be square"); diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 7214c2c355699..ccae5fb75f5b0 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -364,8 +364,8 @@ Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_le // Convenience function accepting Tensors Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, const Tensor& input_lengths, const Tensor& target_lengths, int64_t BLANK, int64_t reduction) { - Tensor ilc = input_lengths.toType(kLong).toBackend(kCPU).contiguous(); - Tensor tlc = target_lengths.toType(kLong).toBackend(kCPU).contiguous(); + Tensor ilc = input_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); + Tensor tlc = target_lengths.toType(kLong).toBackend(Backend::CPU).contiguous(); IntList il(ilc.data(), ilc.numel()); IntList tl(tlc.data(), tlc.numel()); return at::native::ctc_loss(log_probs, targets, il, tl, BLANK, reduction); diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp index dcd04fc8d4a7c..8950cf02d857a 100644 --- a/aten/src/ATen/native/Memory.cpp +++ b/aten/src/ATen/native/Memory.cpp @@ -7,7 +7,7 @@ namespace at { namespace native { Tensor pin_memory(const Tensor& self) { - if (self.type().backend() != kCPU) { + if (self.type().backend() != Backend::CPU) { AT_ERROR("cannot pin '", self.type().toString(), "' only CPU memory can be pinned"); } auto* allocator = detail::getCUDAHooks().getPinnedMemoryAllocator(); diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 4e7a23fd1acfa..b93b7c0d2627f 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -1,3 +1,5 @@ +#include "ATen/native/RNN.h" + #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" @@ -286,7 +288,7 @@ struct FullBidirectionalLayer : Layer, pair_of< std::vector reverse(std::vector&& x) const { std::reverse(x.begin(), x.end()); - return x; + return std::move(x); } FullLayer layer_; @@ -499,100 +501,6 @@ std::tuple _lstm_impl( return std::make_tuple(result.outputs, at::stack(hy, 0), at::stack(cy, 0)); } -//////////////////////////////////////////////////////////////////////////////// -// CUDNN BINDINGS -//////////////////////////////////////////////////////////////////////////////// - -// These must line up with the CUDNN mode codes: -// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t -enum class CuDNNMode { rnn_relu = 0, rnn_tanh = 1, lstm = 2, gru = 3 }; - -std::tuple unpack_hidden(const Tensor& hidden) { - return std::make_tuple(hidden, at::Tensor{}); -} - -std::tuple unpack_hidden(const tpair_of& hidden) { - return hidden; -} - -template -hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { - 
static_assert(std::is_same::value, "pack_hidden not implemented for this type"); - AT_ERROR("NOT IMPLEMENTED"); -} - -template<> -Tensor pack_hidden(const Tensor& hx, const Tensor& cx) { - AT_ASSERT(cx.numel() == 0); - return hx; -} - -template<> -tpair_of pack_hidden>(const Tensor& hx, const Tensor& cx) { - return std::make_tuple(hx, cx); -} - -const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous " - "chunk of memory. This means they need to be compacted " - "at every call, possibly greatly increasing memory usage. " - "To compact weights again call flatten_parameters()."; - -template -LayerOutput _cudnn_impl( - const Tensor& input, const Tensor& _batch_sizes, - const hidden_type& hidden, - TensorList params, bool has_biases, - CuDNNMode cudnn_mode, const Tensor& weight_buf, const Tensor& dropout_state, - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - if (!weight_buf.defined()) { - AT_WARN(WEIGHT_FORMAT_WARN); - } - - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); - - int64_t hidden_size = hx.size(2); - - AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); - IntList batch_sizes { _batch_sizes.data(), static_cast(_batch_sizes.size(0)) }; - // cudnn_output = std::tuple - auto cudnn_output = at::_cudnn_rnn( - input, params, has_biases ? 4 : 2, weight_buf, - hx, cx, static_cast(cudnn_mode), hidden_size, - num_layers, /*batch_first=*/false, dropout_p, train, bidirectional, - batch_sizes, dropout_state); - - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; -} - -template -LayerOutput _cudnn_impl( - const Tensor& input, - const hidden_type& hidden, - TensorList params, bool has_biases, - CuDNNMode cudnn_mode, const Tensor& weight_buf, const Tensor& dropout_state, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - if (!weight_buf.defined()) { - AT_WARN(WEIGHT_FORMAT_WARN); - } - - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); - - int64_t hidden_size = hx.size(2); - - // cudnn_output = std::tuple - auto cudnn_output = at::_cudnn_rnn( - input, params, has_biases ? 
4 : 2, weight_buf, - hx, cx, static_cast(cudnn_mode), hidden_size, - num_layers, batch_first, dropout_p, train, bidirectional, - /*batch_sizes=*/{}, dropout_state); - - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; -} - } // anonymous namespace //////////////////////////////////////////////////////////////////////////////// @@ -600,16 +508,20 @@ LayerOutput _cudnn_impl( //////////////////////////////////////////////////////////////////////////////// #define ONE_HIDDEN_RNN(NAME, CELL) \ +DEFINE_DISPATCH(NAME##_cudnn_stub); \ +DEFINE_DISPATCH(NAME##_packed_cudnn_stub); \ +REGISTER_NO_CPU_DISPATCH(NAME##_cudnn_stub, rnn_fn); \ +REGISTER_NO_CPU_DISPATCH(NAME##_packed_cudnn_stub, rnn_packed_fn); \ + \ std::tuple NAME( \ const Tensor& _input, const Tensor& hx, \ TensorList _params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first, \ - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \ if (at::cudnn_is_acceptable(_input)) { \ - auto result = _cudnn_impl(_input, hx, _params, has_biases, \ - CuDNNMode::NAME, cudnn_weight_buf, cudnn_dropout_state, \ - num_layers, dropout_p, train, bidirectional, batch_first); \ - return std::make_tuple(result.outputs, result.final_hidden); \ + Tensor output, hy; \ + NAME##_cudnn_stub(_input.type().device_type(), output, hy, _input, hx, _params, has_biases, \ + num_layers, dropout_p, train, bidirectional, batch_first); \ + return std::make_tuple(output, hy); \ } \ auto input = batch_first ? _input.transpose(0, 1) : _input; \ auto params = gather_params(_params, has_biases); \ @@ -624,12 +536,12 @@ std::tuple NAME( \ std::tuple NAME( \ const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \ TensorList _params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional, \ - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \ if (at::cudnn_is_acceptable(data)) { \ - auto result = _cudnn_impl(data, batch_sizes, hx, _params, has_biases, \ - CuDNNMode::NAME, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional); \ - return std::make_tuple(result.outputs, result.final_hidden); \ + Tensor output, hy; \ + NAME##_packed_cudnn_stub(data.type().device_type(), output, hy, data, batch_sizes, hx, \ + _params, has_biases, num_layers, dropout_p, train, bidirectional); \ + return std::make_tuple(output, hy); \ } \ PackedSequence input { data, batch_sizes }; \ auto params = gather_params(_params, has_biases); \ @@ -643,16 +555,21 @@ ONE_HIDDEN_RNN(gru, GRUCell) ONE_HIDDEN_RNN(rnn_tanh, SimpleCell) ONE_HIDDEN_RNN(rnn_relu, SimpleCell) +DEFINE_DISPATCH(lstm_cudnn_stub); +DEFINE_DISPATCH(lstm_packed_cudnn_stub); +REGISTER_NO_CPU_DISPATCH(lstm_cudnn_stub, lstm_fn); +REGISTER_NO_CPU_DISPATCH(lstm_packed_cudnn_stub, lstm_packed_fn); + std::tuple lstm( const Tensor& _input, TensorList hx, TensorList _params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first, - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(_input)) { - auto result = 
_cudnn_impl(_input, std::make_tuple(hx[0], hx[1]), _params, has_biases, - CuDNNMode::lstm, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional, batch_first); - return std::make_tuple(result.outputs, std::get<0>(result.final_hidden), std::get<1>(result.final_hidden)); + Tensor output, hy, cy; + lstm_cudnn_stub(_input.type().device_type(), output, hy, cy, _input, hx, _params, has_biases, + num_layers, dropout_p, train, bidirectional, batch_first); + return std::make_tuple(output, hy, cy); } auto input = batch_first ? _input.transpose(0, 1) : _input; auto params = gather_params(_params, has_biases); @@ -667,13 +584,13 @@ std::tuple lstm( std::tuple lstm( const Tensor& data, const Tensor& batch_sizes, TensorList hx, TensorList _params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, - const Tensor& cudnn_weight_buf, const Tensor& cudnn_dropout_state) { + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { AT_CHECK(hx.size() == 2, "lstm expects two hidden states"); if (at::cudnn_is_acceptable(data)) { - auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), _params, has_biases, - CuDNNMode::lstm, cudnn_weight_buf, cudnn_dropout_state, num_layers, dropout_p, train, bidirectional); - return std::make_tuple(result.outputs, std::get<0>(result.final_hidden), std::get<1>(result.final_hidden)); + Tensor output, hy, cy; + lstm_packed_cudnn_stub(data.type().device_type(), output, hy, cy, data, batch_sizes, hx, + _params, has_biases, num_layers, dropout_p, train, bidirectional); + return std::make_tuple(output, hy, cy); } PackedSequence input { data, batch_sizes }; auto params = gather_params(_params, has_biases); diff --git a/aten/src/ATen/native/RNN.h b/aten/src/ATen/native/RNN.h new file mode 100644 index 0000000000000..3fc89993404a9 --- /dev/null +++ b/aten/src/ATen/native/RNN.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool); +using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool); +using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool); +using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool); + +DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub); +DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub); + +}} // namespace at::native + diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 67bc71ca1b68e..db3833e9f8f90 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -413,7 +413,7 @@ void randperm_cpu(Tensor& result, int64_t n, THGenerator* generator) { THGenerator* get_generator(at::Generator* gen) { - auto default_gen = &at::globalContext().defaultGenerator(at::Backend::CPU); + auto default_gen = &at::globalContext().defaultGenerator(at::kCPU); auto gen_ = 
at::check_generator(gen, default_gen); return gen_->generator; } @@ -616,7 +616,7 @@ Tensor tensor_cpu(ArrayRef values, const TensorOptions& options) { template Tensor tensor_cuda(ArrayRef values, const TensorOptions& options) { - auto cpu_tensor = tensor_cpu(values, TensorOptions(options).device(at::kCPU)); + auto cpu_tensor = tensor_cpu(values, TensorOptions(options).device(DeviceType::CPU)); return cpu_tensor.to(options.device()); } diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index d8f0fbe1825e5..28989fc398b2a 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -97,7 +97,7 @@ void TensorIterator::compute_common_type() { op.type = &type; if (op.tensor->defined() && type != op.tensor->type()) { if (op.tensor->dim() == 0) { - if (type.backend() != at::kCUDA) { + if (type.backend() != at::Backend::CUDA) { *op.tensor = op.tensor->toType(type); } } else { @@ -300,7 +300,7 @@ bool TensorIterator::is_scalar(int arg) const { } bool TensorIterator::is_cpu_scalar(int arg) const { - return is_scalar(arg) && operands_[arg].tensor->type().backend() == at::kCPU; + return is_scalar(arg) && operands_[arg].tensor->type().backend() == at::Backend::CPU; } void* TensorIterator::data_ptr(int arg) const { diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 0cce66cf06e3e..245866373d476 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -120,6 +120,7 @@ struct AT_API TensorIterator { } ScalarType dtype(int arg) const { return type(arg).scalarType(); } Backend backend(int arg=0) const { return type(arg).backend(); } + DeviceType device_type(int arg=0) const { return type(arg).device_type(); } bool is_scalar(int arg) const; bool is_cpu_scalar(int arg) const; diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 7ecedff060bf2..6733a94db3454 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -104,7 +104,7 @@ static void sigmoid_kernel(Tensor& result, const Tensor& self) { #define IMPLEMENT_FLOAT_KERNEL(dispatchtypes, op) \ static void op##_kernel(Tensor& result, const Tensor& self) { \ - checkBackend(#op, {result}, kCPU); \ + checkBackend(#op, {result}, Backend::CPU); \ AT_DISPATCH_##dispatchtypes##_TYPES(self.type(), #op, [&] { \ if (self.is_contiguous() && result.is_contiguous()) { \ vml::v##op( \ diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 7c84694056932..4d99f1b5ed22b 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -75,7 +75,7 @@ template static inline std::unique_ptr pin_memory(int64_t size, Tensor dummy) { int64_t adjusted_size = size * sizeof(T); auto* allocator = cuda::getPinnedMemoryAllocator(); - auto& backend = dummy.type().toBackend(kCPU).toScalarType(kByte); + auto& backend = dummy.type().toBackend(Backend::CPU).toScalarType(kByte); return backend.storageWithAllocator(adjusted_size, allocator); } diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 90a13e8255f71..5427e7de0419f 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -249,7 +249,7 @@ Tensor _bincount_cuda_template( } if (self.dim() != 1 || (!std::is_same::value && - *self.min().toBackend(kCPU).data() < 0)) { + *self.min().cpu().data() < 0)) { 
AT_ERROR("bincount only supports 1-d non-negative integral inputs."); } @@ -268,7 +268,7 @@ Tensor _bincount_cuda_template( auto ret = cuda::CUDA_tensor_histogram( output, self, weights, nbins, 1); } else { - output = native::zeros({nbins}, device(kCUDA).dtype(kLong)); + output = native::zeros({nbins}, device(DeviceType::CUDA).dtype(kLong)); auto ret = cuda::CUDA_tensor_histogram( output, self, weights, nbins, 1); } diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index e4866a21d5523..309b54a299caa 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -49,7 +49,7 @@ Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { result.copy_(randperm_out_cuda(result_float, n, generator)); } else { if (n < 30000) { // For small inputs, we offload it to CPU instead. - auto result_cpu = result.type().toBackend(kCPU).tensor({n}); + auto result_cpu = result.type().cpu().tensor({n}); randperm_out(result_cpu, n, generator); result.copy_(result_cpu); } else { diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 994a652dbaa68..6f2d13a1533bd 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #if !AT_CUDNN_ENABLED() @@ -451,7 +453,7 @@ namespace { // (same for the hh weights, and the ih and hh biases). // Since we're storing all the weights in a single tensor anyway, // might as well merge the CUDNN ones into a single tensor as well - int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data(); + int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data(); if (linear_id == 0 || linear_id == num_linear_layers / 2) { std::initializer_list size = { mat_numel * num_linear_layers / 2, 1}; @@ -477,6 +479,46 @@ namespace { return std::make_pair(params, global_layer_params_count); } + // This is a lightweight version of the method above used to quickly get the expected + // parameter offsets. + std::vector get_expected_data_ptrs( + const Tensor& weight_buf, cudnnHandle_t handle, const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, const TensorDescriptor& x_desc, cudnnDataType_t datatype) { + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); + + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; + const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + std::vector data_ptrs; + data_ptrs.reserve(num_dir_layers * 2 * 2); + for (int64_t layer = 0; layer < num_dir_layers; layer++) { + for (auto cudnn_method : cudnn_methods) { + // This API returns a separate pointer for weight of every gate, + // but we represent them as a single tensor, so we're only interested + // in a very limited subset of possible values. 
+ const std::array linear_offsets = { 0, num_linear_layers / 2 }; + for (int64_t linear_id : linear_offsets) { + FilterDescriptor lin_layer_mat_desc; + void* matrix_pointer; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer + )); + data_ptrs.push_back(matrix_pointer); + } + } + } + return data_ptrs; + } + void _copyParams(MatrixRef params_from, MatrixRef params_to) { AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch"); for (size_t i = 0; i < params_from.size(0); i++) { @@ -1007,6 +1049,243 @@ Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int return dropout_desc.state; } +//////////////////////////////////////////////////////////////////////////////// +// CUDA dispatch for the generic RNN ops (at::lstm, at::gru, ...) +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +// Helpers for working with different hidden types. +std::tuple unpack_hidden(const Tensor& hidden) { + return std::make_tuple(hidden, at::Tensor{}); +} + +std::tuple unpack_hidden(const std::tuple& hidden) { + return hidden; +} + +template +hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { + static_assert(std::is_same::value, "pack_hidden not implemented for this type"); + AT_ERROR("NOT IMPLEMENTED"); +} + +template<> +Tensor pack_hidden(const Tensor& hx, const Tensor& cx) { + AT_ASSERT(cx.numel() == 0); + return hx; +} + +template<> +std::tuple pack_hidden>(const Tensor& hx, const Tensor& cx) { + return std::make_tuple(hx, cx); +} + +struct DropoutState { + // Both buffer and event are lazily instantiated when a dropout state is needed + // for the first time. Note that in this case needed != used, as we don't need + // a bufer to e.g. run RNNs in test mode. + at::Tensor buffer; + at::optional event; + std::mutex mutex; + + void lock() { + // NB: We can't ignore the lock even when event is undefined, because someone + // could then define it before we get to unlock(). + mutex.lock(); + if (event) { + cuda::getCurrentCUDAStream().synchronize_with(*event); + } + } + + void unlock() { + if (event) { + event->record(); + } + mutex.unlock(); + } +}; + +DropoutState& get_dropout_state(const Type& tp, double dropout_p, bool train) { + // Each state is slightly over 2MB and initialized lazily, so it's fine to cache them. + static std::vector ten_dropout_state_cache { static_cast(cuda::getNumGPUs()) }; + static std::vector var_dropout_state_cache { static_cast(cuda::getNumGPUs()) }; + static std::mutex state_cache_mut; + + int device = cuda::current_device(); + std::unique_lock lock {state_cache_mut}; + auto& state = tp.is_variable() ? var_dropout_state_cache.at(device) + : ten_dropout_state_cache.at(device); + if (train && dropout_p > 0 && !state.buffer.defined()) { + std::unique_lock lock {state.mutex}; + int64_t seed = at::empty({}, at::kLong).random_().toCLong(); + state.buffer = at::_cudnn_init_dropout_state( + tp.toScalarType(at::kByte), dropout_p, train, seed); + // NB: CUDA binds the event to a device at creation time, so we can initialize it + // only now, when we know we're on the correct device. 
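// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the pack_hidden/unpack_hidden
// idiom used above so one _cudnn_impl template covers both RNN hidden-state
// shapes. The primary template is a compile-time trap; specializations handle
// the single-state case and the LSTM (h, c) pair. State stands in for Tensor.
#include <stdexcept>
#include <tuple>
#include <type_traits>

namespace sketch {
struct State {};

template <typename hidden_type>
hidden_type pack_hidden(const State& hx, const State& cx) {
  static_assert(std::is_same<hidden_type, void>::value,
                "pack_hidden not implemented for this type");
  throw std::logic_error("not implemented");
}

template <>
State pack_hidden<State>(const State& hx, const State& /*cx*/) {
  return hx;  // GRU / tanh / ReLU cells carry only the hidden state
}

template <>
std::tuple<State, State> pack_hidden<std::tuple<State, State>>(
    const State& hx, const State& cx) {
  return std::make_tuple(hx, cx);  // LSTM carries hidden and cell state
}
} // namespace sketch
// ---------------------------------------------------------------------------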
+ state.event.emplace(); + } + return state; +} + +Tensor try_get_weight_buf( + const Tensor& input, TensorList parameters, bool has_biases, + cudnnRNNMode_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional) { + // Prepare all relevant descriptors + auto handle = getCudnnHandle(); + auto datatype = getCudnnDataType(input); + + RNNDescriptorParams rnn; + rnn.set(mode, hidden_size, num_layers, bidirectional, datatype); + RNNDescriptor rnn_desc = rnn.descriptor(handle); + + TensorGeometry x_geom ({1, input.size(-1)}); + TensorDescriptor x_desc; + x_desc.set(datatype, x_geom.sizes(), x_geom.strides(), 5); + + auto num_params = get_num_weights(handle, rnn_desc, x_desc, datatype); + + // Try to get parameter storage + auto & any_param = parameters.at(0); + auto param_storage = any_param.storage(); + auto weight_buf = any_param.type().tensor().set_(*param_storage); + if (weight_buf.size(0) < num_params) { + return {}; + } else if (weight_buf.size(0) > num_params) { + weight_buf = weight_buf.narrow(0, 0, num_params); + } + + // Get and check data pointers + auto expected_data_ptrs = get_expected_data_ptrs( + weight_buf, handle, rnn, rnn_desc, x_desc, datatype); + + int64_t num_parameters = parameters.size(); + int64_t num_ptrs = expected_data_ptrs.size(); + AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2))); + AT_ASSERT(num_ptrs % (has_biases ? 4 : 2) == 0); + for (int64_t param_i = 0, ptr_i = 0; + ptr_i < num_ptrs; + ptr_i += (has_biases ? 2 : 4), param_i += 2) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + } + if (!parameters[num_parameters - 1].is_contiguous()) return {}; + return weight_buf; +} + +const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous " + "chunk of memory. This means they need to be compacted " + "at every call, possibly greatly increasing memory usage. " + "To compact weights again call flatten_parameters()."; + +template +std::pair _cudnn_impl( + const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden, + TensorList params, bool has_biases, cudnnRNNMode_t mode, + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { + Tensor hx, cx; + std::tie(hx, cx) = unpack_hidden(hidden); + int64_t hidden_size = hx.size(2); + + auto weight_buf = try_get_weight_buf( + input, params, has_biases, mode, hidden_size, num_layers, bidirectional); + if (!weight_buf.defined()) { + AT_WARN(WEIGHT_FORMAT_WARN); + } + + AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); + IntList batch_sizes { _batch_sizes.data(), static_cast(_batch_sizes.size(0)) }; + + auto & dropout_state = get_dropout_state(input.type(), dropout_p, train); + std::unique_lock lock { dropout_state }; + // cudnn_output = std::tuple + auto cudnn_output = at::_cudnn_rnn( + input, params, has_biases ? 
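// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): DropoutState above can be used
// with std::unique_lock because any type exposing lock()/unlock() satisfies the
// BasicLockable requirements. This stand-in keeps the mutex half and replaces
// the CUDA event synchronization with comments.
#include <mutex>

namespace sketch {
struct GuardedState {
  std::mutex mutex;

  void lock() {
    // NB: the mutex is taken even if no event exists yet, because another
    // thread could create one before our unlock().
    mutex.lock();
    // In the patch: make the current CUDA stream wait on the last recorded
    // event before touching the shared dropout buffer.
  }
  void unlock() {
    // In the patch: record an event on the current stream first.
    mutex.unlock();
  }
};

inline void use(GuardedState& state) {
  std::unique_lock<GuardedState> guard{state};  // calls state.lock()/unlock()
  // ... launch work that reads/writes the shared state ...
}
} // namespace sketch
// ---------------------------------------------------------------------------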
4 : 2, weight_buf, + hx, cx, static_cast(mode), hidden_size, num_layers, /*batch_first=*/false, + dropout_p, train, bidirectional, batch_sizes, dropout_state.buffer); + + return {std::get<0>(cudnn_output), + pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; +} + +template +std::pair _cudnn_impl( + const Tensor& input, const hidden_type& hidden, + TensorList params, bool has_biases, cudnnRNNMode_t mode, + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + Tensor hx, cx; + std::tie(hx, cx) = unpack_hidden(hidden); + int64_t hidden_size = hx.size(2); + + auto weight_buf = try_get_weight_buf( + input, params, has_biases, mode, hidden_size, num_layers, bidirectional); + if (!weight_buf.defined()) { + AT_WARN(WEIGHT_FORMAT_WARN); + } + + auto & dropout_state = get_dropout_state(input.type(), dropout_p, train); + std::unique_lock lock { dropout_state }; + // cudnn_output = std::tuple + auto cudnn_output = at::_cudnn_rnn( + input, params, has_biases ? 4 : 2, weight_buf, + hx, cx, static_cast(mode), hidden_size, num_layers, batch_first, dropout_p, + train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer); + + return {std::get<0>(cudnn_output), + pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; +} + +#define ONE_HIDDEN_RNN(NAME, MODE) \ +void NAME##_cudnn(Tensor& output, Tensor& hy, \ + const Tensor& input, const Tensor& hx, \ + TensorList params, bool has_biases, \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \ + std::tie(output, hy) = _cudnn_impl(input, hx, params, has_biases, \ + MODE, num_layers, dropout_p, train, bidirectional, batch_first); \ +} \ + \ +void NAME##_packed_cudnn(Tensor& output, Tensor& hy, \ + const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \ + TensorList params, bool has_biases, \ + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \ + std::tie(output, hy) = _cudnn_impl(data, batch_sizes, hx, params, \ + has_biases, MODE, num_layers, dropout_p, train, bidirectional); \ +} \ + \ +REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \ +REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn); + +ONE_HIDDEN_RNN(gru, CUDNN_GRU) +ONE_HIDDEN_RNN(rnn_tanh, CUDNN_RNN_TANH) +ONE_HIDDEN_RNN(rnn_relu, CUDNN_RNN_RELU) + +void lstm_cudnn(Tensor& output, Tensor& hy, Tensor& cy, + const Tensor& input, TensorList hx, + TensorList params, bool has_biases, + int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + auto result = _cudnn_impl(input, std::make_tuple(hx[0], hx[1]), params, has_biases, + CUDNN_LSTM, num_layers, dropout_p, train, bidirectional, batch_first); + output = result.first; + hy = std::get<0>(result.second); + cy = std::get<1>(result.second); +} + +void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy, + const Tensor& data, const Tensor& batch_sizes, TensorList hx, + TensorList params, bool has_biases, + int64_t num_layers, double dropout_p, bool train, bool bidirectional) { + auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), + params, has_biases, CUDNN_LSTM, num_layers, dropout_p, train, bidirectional); + output = result.first; + hy = std::get<0>(result.second); + cy = std::get<1>(result.second); +} + +REGISTER_CUDA_DISPATCH(lstm_cudnn_stub, &lstm_cudnn); +REGISTER_CUDA_DISPATCH(lstm_packed_cudnn_stub, &lstm_packed_cudnn); + +} // anonymous namepsace + }} // namespace at::native #endif // AT_CUDNN_ENABLED() diff --git 
a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index bcac90d5d7de2..f23ee7a322d01 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2110,28 +2110,28 @@ variants: function # RNN cells and layers -- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor, Tensor) +- func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) variants: function -- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor, Tensor) +- func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) variants: function -- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) variants: function -- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) variants: function -- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) variants: function -- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) variants: function -- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor? cudnn_weight_buf={}, Tensor? 
cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) variants: function -- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, Tensor? cudnn_weight_buf={}, Tensor? cudnn_dropout_state={}) -> (Tensor, Tensor) +- func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) variants: function - func: lstm_cell(Tensor input, TensorList hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> (Tensor, Tensor) diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 6c3094e71aa0d..8508cf4c5463a 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -35,8 +35,8 @@ ScalarType ${Type}::scalarType() const { Backend ${Type}::backend() const { return Backend::${Backend}; } -bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } -bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_cuda() const { return backend() == Backend::CUDA || backend() == Backend::SparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == Backend::SparseCPU || backend() == Backend::SparseCUDA; } bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage(bool resizable) const { diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 5a7fd278fb1f0..0d27a88b773fb 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -154,6 +154,9 @@ struct Tensor : public detail::TensorBase { Tensor operator[](Tensor index) const; Tensor operator[](int64_t index) const; + Tensor cpu() const; + Tensor cuda() const; + // ~~~~~ Autograd API ~~~~~ Tensor& set_requires_grad(bool requires_grad) { diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 58c0198375985..1ab26d97a121d 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -19,6 +19,14 @@ inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { return t.copy(*this, non_blocking); } +inline Tensor Tensor::cpu() const { + return toType(type().cpu()); +} + +inline Tensor Tensor::cuda() const { + return toType(type().cuda()); +} + inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { return type().copy_(*this, src, non_blocking); } diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 10036a5286b5b..4523b7b3efbc2 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -74,15 +74,26 @@ struct AT_API Type { Type & toDense() const { return this->toBackend(at::toDense(this->backend())); } + Type & cpu() const { + return this->toBackend(at::backendToCPU(this->backend())); + } + Type & cuda() const { + return this->toBackend(at::backendToCUDA(this->backend())); + } Context& get_context() const { return *context; } - // contingious IDs for all types in the system + // contiguous IDs for all types in the system // for external dispatch virtual TypeID ID() const = 0; // New-style TensorTypeId that supports 
open registration. TensorTypeId type_id() const { return type_id_; } + // NB: This will return DeviceType::CPU for Backend::SparseCPU + DeviceType device_type() const { + return backendToDeviceType(backend()); + } + Tensor copy(const Tensor & src, bool non_blocking=false) const; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; @@ -121,7 +132,7 @@ inline Layout Tensor::layout() const noexcept { } inline Device Tensor::device() const { - return Device(type().backend(), type().is_cuda() ? get_device() : -1); + return Device(type().device_type(), type().is_cuda() ? get_device() : -1); } } // namespace at diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 5f2e354d8869e..82942655a8514 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -46,8 +46,8 @@ ScalarType ${Type}::scalarType() const { Backend ${Type}::backend() const { return Backend::${Backend}; } -bool ${Type}::is_cuda() const { return backend() == kCUDA || backend() == kSparseCUDA; } -bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == kSparseCUDA; } +bool ${Type}::is_cuda() const { return backend() == Backend::CUDA || backend() == Backend::SparseCUDA; } +bool ${Type}::is_sparse() const { return backend() == Backend::SparseCPU || backend() == Backend::SparseCUDA; } bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage(bool resizable) const { @@ -80,9 +80,9 @@ std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, con ScalarType::${ScalarName}, InefficientStdFunctionContext::makeDataPtr(data, deleter, #if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) + Device(DeviceType::CUDA, getPointerDevice(data)) #else - kCPU + DeviceType::CPU #endif ), size, diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 24359a05e4bf6..38027baae97b7 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -109,31 +109,31 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { } TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } TEST_CASE("apply utils test 2-dim small", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } TEST_CASE("apply utils test 2-dim", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } TEST_CASE("apply utils test 3-dim", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } TEST_CASE("apply utils test 3-dim medium", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } TEST_CASE("apply utils test 10-dim", "[cpu]") { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index 0271de625fd13..cc831fbd42832 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -24,8 +24,8 @@ void trace() { TEST_CASE( "atest", "[]" ) { - manual_seed(123, at::Backend::CPU); - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCPU); + 
manual_seed(123, at::kCUDA); auto foo = rand({12,6}); REQUIRE(foo.data() == foo.toFloatData()); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 0e668e27919b1..cfd77986d626c 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -57,7 +57,7 @@ static void test(Type & type) { REQUIRE(Scalar(z_sorted[0][0]).toFloat() < Scalar(z_sorted[0][1]).toFloat()); } - if(type.backend() != kCUDA) + if(type.backend() != Backend::CUDA) SECTION( "randperm" ) { Tensor b = randperm(15, type); Tensor rv, ri; @@ -277,13 +277,13 @@ static void test(Type & type) { } TEST_CASE( "basic tests CPU", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kFloat)); } TEST_CASE( "basic tests GPU", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); if(at::hasCUDA()) { test(CUDA(kFloat)); diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index b86f58f3deeee..395b49d4be0f5 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -8,7 +8,7 @@ using namespace at; TEST_CASE( "broadcast", "[]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); Type & T = CPU(kFloat); diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 7194c83c0be71..31786e88a0944 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -10,7 +10,7 @@ using namespace at; using namespace at::native; TEST_CASE( "cudnn", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 auto handle = getCudnnHandle(); diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index 1603e3d54b16e..4882929876027 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -13,7 +13,7 @@ using namespace at; TEST_CASE( "dlconvertor", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); INFO( "convert ATen to DLTensor" ); diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 99a21d36d7720..fac85da04aa60 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -179,13 +179,13 @@ void test(Type & T, Type & AccT) { } TEST_CASE( "native test CPU", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } TEST_CASE( "native test CUDA", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); if (at::hasCUDA()) { test(CUDA(kFloat), CUDA(kDouble)); diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 59d8e369772fa..0907c89e09b06 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -264,13 +264,13 @@ void test(Type &T) { } TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(CPU(kFloat)); } TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCUDA); if (at::hasCUDA()) { test(CUDA(kFloat)); diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 2880004555a74..a83f345cc48c3 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -72,15 +72,15 @@ void test_overflow() { TEST_CASE( "scalar test", 
"[]" ) { - manual_seed(123, at::Backend::CPU); - manual_seed(123, at::Backend::CUDA); + manual_seed(123, at::kCPU); + manual_seed(123, at::kCUDA); Scalar what = 257; Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; - Generator & gen = at::globalContext().defaultGenerator(Backend::CPU); + Generator & gen = at::globalContext().defaultGenerator(at::kCPU); REQUIRE_NOTHROW(gen.seed()); auto && C = at::globalContext(); if(at::hasCUDA()) { diff --git a/aten/src/ATen/test/tbb_init_test.cpp b/aten/src/ATen/test/tbb_init_test.cpp index ae8b02acbc4e8..a0f21734fa6e4 100644 --- a/aten/src/ATen/test/tbb_init_test.cpp +++ b/aten/src/ATen/test/tbb_init_test.cpp @@ -23,7 +23,7 @@ void test(int given_num_threads) { } int main() { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); test(-1); std::thread t1(test, -1); diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 5dbd9676e5de9..552328029ce03 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -13,7 +13,7 @@ using namespace at; TEST_CASE( "parallel", "[cpu]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); set_num_threads(1); Tensor a = rand({1,3}); diff --git a/aten/src/ATen/test/test_seed.h b/aten/src/ATen/test/test_seed.h index 16f9ecb6ed479..980a6eb823ee9 100644 --- a/aten/src/ATen/test/test_seed.h +++ b/aten/src/ATen/test/test_seed.h @@ -2,12 +2,12 @@ #include "ATen/ATen.h" -void manual_seed(uint64_t seed, at::Backend backend) { - if (backend == at::Backend::CPU) { - at::Generator & cpu_gen = at::globalContext().defaultGenerator(at::Backend::CPU); +void manual_seed(uint64_t seed, at::DeviceType backend) { + if (backend == at::kCPU) { + at::Generator & cpu_gen = at::globalContext().defaultGenerator(at::kCPU); cpu_gen.manualSeed(seed); - } else if (backend == at::Backend::CUDA && at::hasCUDA()) { - at::Generator & cuda_gen = at::globalContext().defaultGenerator(at::Backend::CUDA); + } else if (backend == at::kCUDA && at::hasCUDA()) { + at::Generator & cuda_gen = at::globalContext().defaultGenerator(at::kCUDA); cuda_gen.manualSeed(seed); } } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index 7e5dad7e46f6d..d302d736c696d 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -9,7 +9,7 @@ using namespace at; TEST_CASE( "undefined tensor test", "[]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. 
Tensor und; diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index 23d37117cb871..1c45e1d00a058 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -7,7 +7,7 @@ using namespace at; TEST_CASE( "wrapdim test", "[]" ) { - manual_seed(123, at::Backend::CPU); + manual_seed(123, at::kCPU); Type & T = CPU(kFloat); diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp index 9dccbb384b17a..1cbc232890b4c 100644 --- a/aten/src/TH/THAllocator.cpp +++ b/aten/src/TH/THAllocator.cpp @@ -22,7 +22,7 @@ struct THDefaultAllocator final : public at::Allocator { at::DataPtr allocate(size_t size) const override { auto* ptr = THAlloc(size); - return {ptr, ptr, &THFree, at::kCPU}; + return {ptr, ptr, &THFree, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { return &THFree; @@ -537,25 +537,25 @@ THRefcountedMapAllocator* THRefcountedMapAllocator::fromDataPtr(const at::DataPt at::DataPtr THMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { auto* context = new THMapAllocator(filename, flags, size); if (actual_size_out) *actual_size_out = context->size(); - return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHMapAllocator, at::DeviceType::CPU}; } at::DataPtr THMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new THMapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size(); - return {context->data(), context, &deleteTHMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHMapAllocator, at::DeviceType::CPU}; } at::DataPtr THRefcountedMapAllocator::makeDataPtr(const char *filename, int flags, size_t size, size_t* actual_size_out) { auto* context = new THRefcountedMapAllocator(filename, flags, size); if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; - return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::DeviceType::CPU}; } at::DataPtr THRefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new THRefcountedMapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size() - TH_ALLOC_ALIGNMENT; - return {context->data(), context, &deleteTHRefcountedMapAllocator, at::kCPU}; + return {context->data(), context, &deleteTHRefcountedMapAllocator, at::DeviceType::CPU}; } void* THRefcountedMapAllocator::data() const { diff --git a/aten/src/THC/THCAllocator.cpp b/aten/src/THC/THCAllocator.cpp index c6be2f0afefbb..098ec406110c5 100644 --- a/aten/src/THC/THCAllocator.cpp +++ b/aten/src/THC/THCAllocator.cpp @@ -10,7 +10,7 @@ struct THCudaHostAllocator : public at::Allocator { if (size != 0) { THCudaCheck(cudaMallocHost(&ptr, size)); } - return {ptr, ptr, &THCudaHostDeleter, at::kCPU}; + return {ptr, ptr, &THCudaHostDeleter, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { return &THCudaHostDeleter; @@ -34,7 +34,7 @@ struct THCUVAAllocator : public at::Allocator { if (size != 0) { THCudaCheck(cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)); } - return {ptr, ptr, &THCUVADeleter, at::kCPU}; + return {ptr, ptr, &THCUVADeleter, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { 
return &THCUVADeleter; @@ -64,5 +64,5 @@ at::DataPtr THCIpcDeleter::makeDataPtr(void* data, int device) { int cur_device; THCudaCheck(cudaGetDevice(&cur_device)); auto* context = new THCIpcDeleter(data, device); - return {data, context, &deleteTHCIpcDeleter, at::Device(at::kCUDA, cur_device)}; + return {data, context, &deleteTHCIpcDeleter, at::Device(at::DeviceType::CUDA, cur_device)}; } diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 9a9de1f5e9b62..44c536e7e5d70 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -2,6 +2,8 @@ #include "THCGeneral.h" #include "THCHalf.h" +#include + float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy) { if (n == 1) { diff --git a/aten/src/THC/THCCachingAllocator.cpp b/aten/src/THC/THCCachingAllocator.cpp index b63e47d86eac7..7882e9a37d546 100644 --- a/aten/src/THC/THCCachingAllocator.cpp +++ b/aten/src/THC/THCCachingAllocator.cpp @@ -510,7 +510,7 @@ struct CudaCachingAllocator : public at::Allocator { if (size != 0) { AT_CUDA_CHECK(caching_allocator.malloc(&r, size, at::cuda::getCurrentCUDAStreamOnDevice(device))); } - return {r, r, &CudaCachingDeleter, at::Device(at::kCUDA, device)}; + return {r, r, &CudaCachingDeleter, at::Device(at::DeviceType::CUDA, device)}; } at::DeleterFnPtr raw_deleter() const override { return &CudaCachingDeleter; diff --git a/aten/src/THC/THCCachingHostAllocator.cpp b/aten/src/THC/THCCachingHostAllocator.cpp index 617c6f2f520af..b371ed9873abe 100644 --- a/aten/src/THC/THCCachingHostAllocator.cpp +++ b/aten/src/THC/THCCachingHostAllocator.cpp @@ -269,7 +269,7 @@ struct THCCachingHostAllocator final : public at::Allocator { THAssert(size >= 0); void *ptr; THCudaCheck(allocator.malloc(&ptr, size)); - return {ptr, ptr, &THCCachingHostDeleter, at::kCPU}; + return {ptr, ptr, &THCCachingHostDeleter, at::DeviceType::CPU}; } at::DeleterFnPtr raw_deleter() const override { return &THCCachingHostDeleter; diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 05b76d79f59c1..1b716db430714 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -49,7 +49,7 @@ struct THDefaultDeviceAllocator final : public at::Allocator { if (size != 0) THCudaCheck(cudaMalloc(&p, size)); int device; THCudaCheck(cudaGetDevice(&device)); - return {p, p, &THDefaultDeviceDeleter, at::Device(at::kCUDA, device)}; + return {p, p, &THDefaultDeviceDeleter, at::Device(at::DeviceType::CUDA, device)}; } at::DeleterFnPtr raw_deleter() const override { return &THDefaultDeviceDeleter; diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index f76b39a816048..0fb6fea51f5d5 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -22,7 +22,7 @@ void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) if(size == 0) { - self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::kCUDA, device))); + self->set_data_ptr(at::DataPtr(nullptr, at::Device(at::DeviceType::CUDA, device))); self->set_size(0); } else diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index df3ee5326b7d9..0483ebb05d968 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -6,7 +6,7 @@ namespace caffe2 { REGISTER_CPU_OPERATOR(ATen, ATenOp); template<> at::Backend ATenOp::backend() const { - return at::kCPU; + return at::Backend::CPU; } OPERATOR_SCHEMA(ATen); diff --git a/caffe2/contrib/aten/aten_op_cuda.cc b/caffe2/contrib/aten/aten_op_cuda.cc index d416e700cb186..8e1c6bdd23645 
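The allocator call sites above now tag returned memory with an at::DeviceType-based at::Device rather than a Backend. As a hedged illustration of that pattern (MyCpuAllocator and my_free are hypothetical names, not part of this patch; it simply mirrors the shape of THDefaultAllocator above):

#include <cstdlib>
#include "ATen/Allocator.h"

namespace {

void my_free(void* ptr) {
  std::free(ptr);
}

// The returned DataPtr records the raw pointer, a context pointer, a deleter,
// and the DeviceType-based device the memory lives on.
struct MyCpuAllocator final : public at::Allocator {
  at::DataPtr allocate(size_t size) const override {
    void* ptr = std::malloc(size);
    return {ptr, ptr, &my_free, at::DeviceType::CPU};
  }
  at::DeleterFnPtr raw_deleter() const override {
    return &my_free;
  }
};

} // namespace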
100644 --- a/caffe2/contrib/aten/aten_op_cuda.cc +++ b/caffe2/contrib/aten/aten_op_cuda.cc @@ -6,7 +6,7 @@ namespace caffe2 { REGISTER_CUDA_OPERATOR(ATen, ATenOp); template<> at::Backend ATenOp::backend() const { - return at::kCUDA; + return at::Backend::CUDA; } namespace math { diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 3cfb97292c9a2..4e9f6f2ac280f 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -51,6 +51,10 @@ OnnxifiOp::BuildInitializationList( std::vector* weight_names, std::vector>* weight_shapes) { const std::vector& ws_blobs = ws->Blobs(); + // Since onnxTensorDescriptorV1.name will point into the memory in + // weight_names, we need to prevent weight_names from reallocating by + // reserving enough memory ahead of time + weight_names->reserve(ws_blobs.size()); std::vector descs; for (const auto& s : ws_blobs) { auto it = initialization_list->find(s); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index f5ea0f678ed51..61d5301adb72e 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -11,12 +11,17 @@ using namespace nom; // $$ X_{bn} = \frac{s(X - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ // $$ X_{conv} = X * W + b_{conv} $$ // thus, substituting $X$ with $X_{conv}$ in the BN equation we get: -// $$X_{bn} = X * \frac{sW}{\sqrt{\sigma + \epsilon}} + \frac{s(b_{conv} - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ -// or +// $$X_{bn} = X * \frac{sW}{\sqrt{\sigma + \epsilon}} + \frac{s(b_{conv} - +// m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ or // $$ W' = W\frac{s}{\sqrt{\sigma + \epsilon}}$$ // $$ b' = (b_{conv} - m)\frac{s}{\sqrt{\sigma + \epsilon}} + b_{bn}$$ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { - for (auto convNode : repr::nn::nodeIterator(nn->dataFlow)) { + size_t convOrder = 0; + for (auto node_pair : repr::nn::dataIterator(nn->dataFlow)) { + repr::NNGraph::NodeRef convNode; + repr::Conv* conv; + std::tie(conv, convNode) = node_pair; + auto output = repr::nn::getOutputs(convNode).front(); auto consumers = repr::nn::getConsumers(output); NOM_REQUIRE_OR_CONT(consumers.size() == 1); @@ -31,9 +36,9 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { auto bnOutput = bnOutputs.front(); auto convInputs = repr::nn::getInputs(convNode); - CAFFE_ENFORCE( - convInputs.size() >= 3, - "Invalid convolution input size (TODO: optional bias)"); + if (convInputs.size() < 2) { + continue; + } auto bnInputs = repr::nn::getInputs(bnNode); CAFFE_ENFORCE( @@ -46,13 +51,46 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { auto name##Data = name##Tensor->mutable_data(); EXPOSE_TENSOR_DATA(filter, 1, convInputs); - EXPOSE_TENSOR_DATA(biasConv, 2, convInputs); EXPOSE_TENSOR_DATA(scale, 1, bnInputs); EXPOSE_TENSOR_DATA(biasBN, 2, bnInputs); EXPOSE_TENSOR_DATA(mean, 3, bnInputs); EXPOSE_TENSOR_DATA(variance, 4, bnInputs); + if (convInputs.size() == 2) { + NOM_REQUIRE_OR_CONT(conv->getMutableAnnotation() != nullptr); + auto annotation = + dyn_cast(conv->getMutableAnnotation()); + NOM_REQUIRE_OR_CONT(annotation != nullptr); + auto op = annotation->getOperatorDef(); + auto convName = op.name(); + + while (true) { + auto convBiasName = convName + "_bias" + to_string(convOrder); + if (!ws->HasBlob(convBiasName)) { + auto convBiasTensor = make_unique(convBiasName); + convBiasTensor->setType(repr::Tensor::DataType::Float); + auto convBiasNode = nn->dataFlow.createNode( + unique_dyn_cast(convBiasTensor)); + nn->inputs.insert(convBiasNode); + 
nn->dataFlow.createEdge(convBiasNode, convNode); + + auto* blob = ws->CreateBlob(convBiasName); + caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + CHECK_NOTNULL(tensor); + // Get output channel + size_t c = filterTensor->dim32(0); + tensor->Resize(c); + tensor->mutable_data(); + break; + } + convOrder++; + } + } + + convInputs = repr::nn::getInputs(convNode); + EXPOSE_TENSOR_DATA(biasConv, 2, convInputs); + #undef EXPOSE_TENSOR_DATA // Assume M{CHW,HWC} diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py index a51251a23a75a..302ee8de817f5 100644 --- a/caffe2/python/dataio.py +++ b/caffe2/python/dataio.py @@ -586,6 +586,11 @@ def schema(self): def setup(self, **kwargs): for reader_builder in self._reader_builders: reader_builder.setup(**kwargs) + # limiter is stateful; it can only be used once. Since + # CompositeReader stops when one of the reader stops, + # this is fine. + if "limiter" in kwargs: + kwargs.pop("limiter") def new_reader(self, **kwargs): readers = [] diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index e2a8ac0c2102d..ee38fe52df8c4 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -55,12 +55,17 @@ def _init_output(output, capacity, global_init_net, global_exit_net): return out_queue, writer -def make_processor(processor): +def make_processor(processor, reader=None): if processor is None: return lambda rec: rec elif isinstance(processor, core.Net): return NetProcessor(processor) else: + if reader is not None and hasattr(processor, "schema_func"): + def processor_schema(): + return processor.schema_func(reader) + + processor.schema = processor_schema return processor @@ -352,7 +357,10 @@ class ProcessingReader(Reader): def __init__(self, reader, processor): Reader.__init__(self) self.reader = reader - self.processor = make_processor(processor) + self.processor = make_processor(processor, reader) + + def schema(self): + return self.processor.schema() def setup_ex(self, init_net, finish_net): self.reader.setup_ex(init_net, finish_net) @@ -404,6 +412,9 @@ def __init__(self, net, stop_signal=None, thread_init_nets=None, name=None): self._frozen = False self._cloned_init_nets = [] + def schema(self): + return self.net.output_record() + def setup(self, init_net): self._frozen = True cloned_init_nets = self._cloned_init_nets diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 4e215b586e5f3..d9992116a696e 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -221,6 +221,116 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon assert np.allclose( preTransformOutput, postTransformOutput, - rtol=1e-02, + rtol=5e-02, + atol=1e-03 + ) + + @given( + size=st.integers(7, 10), + input_channels=st.integers(1, 10), + seed=st.integers(0, 65535), + order=st.sampled_from(["NCHW", "NHWC"]), + epsilon=st.floats(min_value=1e-5, max_value=1e-2), + ) + def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, order, epsilon): + workspace.ResetWorkspace() + net = core.Net("net") + c = input_channels + h = size + w = size + k = 3 + net.Conv(["X", "w"], ["Y"], stride=1, pad=0, kernel=k, order=order) + net.SpatialBN( + ["Y", "scale", "bias", "mean", "var"], + ["Y2"], + is_test=True, + order=order, + epsilon=epsilon, + ) + + np.random.seed(seed) + if order == "NCHW": + workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, c, k, 
k).astype(np.float32)) + else: + workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) + workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + # This is necessary because 1/sqrt(var) is used and if var is too small + # we get floating point artifacts that cause test failures + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + workspace.RunNetOnce(net) + preTransformOutput = workspace.FetchBlob("Y2").flatten() + workspace.FeedBlob("Y2", np.zeros((1, 1))) + transformer.FuseConvBN(net) + + # Ensure fusion + assert len(net.Proto().op) == 1 + workspace.RunNetOnce(net) + postTransformOutput = workspace.FetchBlob("Y2").flatten() + # Check that there is no numerical difference + assert np.allclose( + preTransformOutput, + postTransformOutput, + rtol=5e-02, + atol=1e-03 + ) + + @given( + size=st.integers(7, 10), + input_channels=st.integers(1, 10), + seed=st.integers(0, 65535), + order=st.sampled_from(["NCHW", "NHWC"]), + epsilon=st.floats(min_value=1e-5, max_value=1e-2), + ) + def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channels, seed, order, epsilon): + workspace.ResetWorkspace() + net = core.Net("net") + c = input_channels + h = size + w = size + k = 3 + net.Conv(["X", "w"], ["Y"], stride=1, pad=0, kernel=k, order=order) + net.SpatialBN( + ["Y", "scale", "_bias0", "mean", "var"], + ["Y2"], + is_test=True, + order=order, + epsilon=epsilon, + ) + + np.random.seed(seed) + if order == "NCHW": + workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + else: + workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) + workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) + workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("_bias0", np.random.rand(c).astype(np.float32)) + workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + # This is necessary because 1/sqrt(var) is used and if var is too small + # we get floating point artifacts that cause test failures + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + workspace.RunNetOnce(net) + preTransformOutput = workspace.FetchBlob("Y2").flatten() + workspace.FeedBlob("Y2", np.zeros((1, 1))) + transformer.FuseConvBN(net) + + # Ensure fusion + assert len(net.Proto().op) == 1 + workspace.RunNetOnce(net) + postTransformOutput = workspace.FetchBlob("Y2").flatten() + print("pre") + print(preTransformOutput) + print("after") + print(postTransformOutput) + # Check that there is no numerical difference + assert np.allclose( + preTransformOutput, + postTransformOutput, + rtol=5e-02, atol=1e-03 ) diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile new file mode 100644 index 0000000000000..acb9e35fc2bb0 --- /dev/null +++ b/docs/cpp/Doxyfile @@ -0,0 +1,2468 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] 
+# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "PyTorch" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = build + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = "rst=\verbatim embed:rst:leading-asterisk" +ALIASES += "endrst=\endverbatim" + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". 
For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. 
+ +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. 
By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. 
+ +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. 
If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. 
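+# A note on the paths below: as with EXCLUDE further down, relative paths are
+# resolved from the directory in which doxygen is run, and the ../../ prefixes
+# assume that directory sits two levels below the repository root (for
+# instance a docs/cpp/ directory; the exact location and the config file name
+# used here are assumptions). A minimal sketch of a build under that
+# assumption:
+#
+#   cd docs/cpp && doxygen Doxyfile
+#
+# The ../../build/aten/... headers listed last appear to be generated files, so
+# an ordinary PyTorch build would have to run before doxygen can find them.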
+ +INPUT = ../../torch/csrc/api/include \ + ../../torch/csrc/api/src \ + ../../aten/src/ATen/ATen.h \ + ../../aten/src/ATen/Backend.h \ + ../../aten/src/ATen/Context.h \ + ../../aten/src/ATen/Device.h \ + ../../aten/src/ATen/DeviceGuard.h \ + ../../aten/src/ATen/Layout.h \ + ../../aten/src/ATen/OptionsGuard.h \ + ../../aten/src/ATen/Scalar.h \ + ../../aten/src/ATen/TensorOptions.h \ + ../../aten/src/ATen/core/ArrayRef.h \ + ../../aten/src/ATen/core/DeviceType.h \ + ../../aten/src/ATen/core/Error.h \ + ../../aten/src/ATen/core/Half.h \ + ../../aten/src/ATen/core/ScalarType.h \ + ../../aten/src/ATen/cuda/CUDAGuard.h \ + ../../aten/src/ATen/cuda/CUDAStream.h \ + ../../aten/src/ATen/cuda/CUDAHalf.h \ + ../../aten/src/ATen/cuda/CUDAContext.h \ + ../../aten/src/ATen/cuda/PinnedMemoryAllocator.h \ + ../../aten/src/ATen/cudnn/Descriptors.h \ + ../../aten/src/ATen/cudnn/Handles.h \ + ../../aten/src/ATen/cudnn/Types.h \ + ../../aten/src/ATen/cudnn/Utils.h \ + ../../aten/src/ATen/mkl/Descriptors.h \ + ../../build/aten/src/ATen/Tensor.h \ + ../../build/aten/src/ATen/Functions.h \ + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.h *.cpp + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. 
The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+ +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = NO + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. 
See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. 
The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. 
This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. 
+# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. 
+ +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/