diff --git a/aten/src/ATen/Backend.h b/aten/src/ATen/Backend.h index ccb96feeed238a..40db1ee67f2476 100644 --- a/aten/src/ATen/Backend.h +++ b/aten/src/ATen/Backend.h @@ -1,4 +1,9 @@ #pragma once + +#include +#include +#include + #include namespace at { @@ -40,6 +45,39 @@ static inline Backend toDense(Backend b) { } } +static inline Backend tensorTypeIdToBackend(TensorTypeId t) { + if (t == CPUTensorId()) { + return Backend::CPU; + } else if (t == CUDATensorId()) { + return Backend::CUDA; + } else if (t == SparseCPUTensorId()) { + return Backend::SparseCPU; + } else if (t == SparseCUDATensorId()) { + return Backend::SparseCUDA; + } else if (t == UndefinedTensorId()) { + return Backend::Undefined; + } else { + AT_ERROR("Unrecognized tensor type ID: ", t); + } +} + +static inline TensorTypeId backendToTensorTypeId(Backend b) { + switch (b) { + case Backend::CPU: + return CPUTensorId(); + case Backend::CUDA: + return CUDATensorId(); + case Backend::SparseCPU: + return SparseCPUTensorId(); + case Backend::SparseCUDA: + return SparseCUDATensorId(); + case Backend::Undefined: + return UndefinedTensorId(); + default: + throw std::runtime_error("Unknown backend"); + } +} + static inline const char* toString(Backend b) { switch (b) { case Backend::CPU: diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 03a5a6008e7d24..f4146153f798fe 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -3,6 +3,18 @@ namespace at { +namespace { + Backend sparseTensorIdToDenseBackend(TensorTypeId type_id) { + if (type_id == SparseCPUTensorId()) { + return Backend::CPU; + } else if (type_id == SparseCUDATensorId()) { + return Backend::CUDA; + } else { + AT_ERROR("Cannot construct SparseTensor with non-sparse tensor type ID ", type_id); + } + } +} + // An empty dense tensor defaults to a 1-dimensional tensor of size [0] // (recall, it is not a 0-dimensional tensor, because such a tensor would @@ -18,15 +30,13 @@ namespace at { // tensor and a [0] size values tensor for such an empty tensor. However, // we don't currently support zero-size dimensions, so we can't actually // do this; so we just allocate zero-size tensors for everything. -SparseTensorImpl::SparseTensorImpl(at::Backend backend, at::ScalarType scalar_type) - : TensorImpl(backend, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, at::ScalarType scalar_type) + : TensorImpl(type_id, scalar_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) - , indices_(globalContext().getTypeOpt(toDense(backend), ScalarType::Long)->tensor()) - , values_(globalContext().getTypeOpt(toDense(backend), scalar_type)->tensor()) { - AT_ASSERT(backend == Backend::SparseCPU || backend == Backend::SparseCUDA); - } + , indices_(globalContext().getTypeOpt(sparseTensorIdToDenseBackend(type_id), ScalarType::Long)->tensor()) + , values_(globalContext().getTypeOpt(sparseTensorIdToDenseBackend(type_id), scalar_type)->tensor()) {} IntList SparseTensorImpl::sizes() const { return size_; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 8c44200cee31bf..3a1fa66a4ca3aa 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -48,7 +48,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
- explicit SparseTensorImpl(at::Backend, at::ScalarType); + explicit SparseTensorImpl(at::TensorTypeId, at::ScalarType); int64_t nnz() const { return nnz_; } int64_t sparseDims() const { return sparseDims_; } diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp index 77c99c4be9f27f..e5f9bf98fb3a67 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/TensorImpl.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -12,7 +13,10 @@ namespace at { Type& TensorImpl::type() const { - Type* base_type = &globalContext().getType(backend_, scalar_type_); + // Select backend from the hard-coded ones that the legacy ATen dispatcher + // knows about + Backend backend = tensorTypeIdToBackend(type_id_); + Type* base_type = &globalContext().getType(backend, scalar_type_); if (is_variable_) { return detail::getVariableHooks().getVariableType(*base_type); } else { @@ -55,10 +59,9 @@ void Tensor::backward( pImpl->backward(std::move(gradient), keep_graph, create_graph); } -TensorImpl::TensorImpl(Backend backend, ScalarType scalar_type) { - backend_ = backend; - scalar_type_ = scalar_type; - auto type = &globalContext().getType(backend, scalar_type); +TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type) + : type_id_(type_id), scalar_type_(scalar_type) { + auto type = &globalContext().getType(tensorTypeIdToBackend(type_id), scalar_type); Storage* storage = type->storage(true).release(); StorageImpl* storage_impl = storage->pImpl(); tensor = new THTensor(storage_impl); diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index fb6de9b6592e7c..c9b701c02f324d 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -6,6 +6,8 @@ #include "ATen/Retainable.h" #include "ATen/ScalarType.h" #include "ATen/core/optional.h" +#include "ATen/core/TensorTypeId.h" +#include "ATen/core/TensorTypeIdRegistration.h" struct THTensor; @@ -18,9 +20,9 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public Retainable { - explicit TensorImpl(Backend backend, ScalarType scalar_type, THTensor * tensor, bool is_variable) - : backend_(backend), scalar_type_(scalar_type), is_variable_(is_variable), tensor(tensor) {} - TensorImpl(Backend backend, ScalarType scalar_type); + explicit TensorImpl(TensorTypeId type_id, ScalarType scalar_type, THTensor * tensor, bool is_variable) + : type_id_(type_id), scalar_type_(scalar_type), is_variable_(is_variable), tensor(tensor) {} + TensorImpl(TensorTypeId type_id, ScalarType scalar_type); virtual ~TensorImpl(); @@ -94,7 +96,7 @@ struct AT_API TensorImpl : public Retainable { virtual void set_data(Tensor new_data); protected: - Backend backend_; + TensorTypeId type_id_; // INVARIANT: When storage is non-null, this scalar type must // agree with the scalar type in storage ScalarType scalar_type_; diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 8a818538be4d4c..0390b412d9c2e6 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -6,7 +6,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? 
UndefinedTensor::UndefinedTensor() -: TensorImpl(Backend::Undefined, ScalarType::Undefined, nullptr, /* is variable */ false) { +: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { } IntList UndefinedTensor::sizes() const { diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index a18bbc1e0e85a9..63e9098ede528c 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -4,7 +4,7 @@ namespace at { UndefinedType::UndefinedType(Context* context) - : Type(context, /*is_variable=*/false, /*is_undefined=*/true) {} + : Type(context, UndefinedTensorId(), /*is_variable=*/false, /*is_undefined=*/true) {} ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } diff --git a/aten/src/ATen/core/DeviceType.h b/aten/src/ATen/core/DeviceType.h index 2d2a090fddf41d..5614d247af7ae5 100644 --- a/aten/src/ATen/core/DeviceType.h +++ b/aten/src/ATen/core/DeviceType.h @@ -1,3 +1,5 @@ +#pragma once + // This is directly synchronized with caffe2/proto/caffe2.proto, but // doesn't require me to figure out how to get Protobuf headers into // ATen/core (which would require a lot more build system hacking.) diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h index 7d152269d9a8c2..58632ce111db57 100644 --- a/aten/src/ATen/core/IdWrapper.h +++ b/aten/src/ATen/core/IdWrapper.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace at { @@ -21,7 +22,7 @@ namespace at { * for you, given the underlying type supports it. */ template -class IdWrapper { +class AT_CORE_API IdWrapper { public: using underlying_type = UnderlyingType; using concrete_type = ConcreteType; diff --git a/aten/src/ATen/core/TensorTypeId.cpp b/aten/src/ATen/core/TensorTypeId.cpp index 605d303ad62ee3..a07c8326edb531 100644 --- a/aten/src/ATen/core/TensorTypeId.cpp +++ b/aten/src/ATen/core/TensorTypeId.cpp @@ -1,5 +1,9 @@ #include "ATen/core/TensorTypeId.h" +namespace at { + std::ostream& operator<<(std::ostream& str, at::TensorTypeId rhs) { return str << rhs.underlyingId(); } + +} // namespace at diff --git a/aten/src/ATen/core/TensorTypeId.h b/aten/src/ATen/core/TensorTypeId.h index 5fc411137e08b4..d01437bbe9197b 100644 --- a/aten/src/ATen/core/TensorTypeId.h +++ b/aten/src/ATen/core/TensorTypeId.h @@ -5,12 +5,7 @@ #include #include #include "ATen/core/IdWrapper.h" - -namespace at { -class TensorTypeId; -} - -std::ostream& operator<<(std::ostream&, at::TensorTypeId); +#include "ATen/core/Macros.h" namespace at { @@ -22,7 +17,7 @@ using _tensorTypeId_underlyingType = uint8_t; * Dynamic type ID of a Tensor argument. It represents something like * CPUTensor, etc. 
*/ -class TensorTypeId final +class AT_CORE_API TensorTypeId final : public at:: IdWrapper { public: @@ -37,9 +32,11 @@ class TensorTypeId final : IdWrapper(id) {} friend class TensorTypeIdCreator; - friend std::ostream& ::operator<<(std::ostream&, TensorTypeId); + friend AT_CORE_API std::ostream& operator<<(std::ostream&, TensorTypeId); }; +AT_CORE_API std::ostream& operator<<(std::ostream&, at::TensorTypeId); + } // namespace at AT_DEFINE_HASH_FOR_IDWRAPPER(at::TensorTypeId) diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.cpp b/aten/src/ATen/core/TensorTypeIdRegistration.cpp index af0b992e51c6ff..17e2b2e7cd8baf 100644 --- a/aten/src/ATen/core/TensorTypeIdRegistration.cpp +++ b/aten/src/ATen/core/TensorTypeIdRegistration.cpp @@ -4,8 +4,6 @@ namespace at { -constexpr at::TensorTypeId TensorTypeIdCreator::max_id_; - TensorTypeIds::TensorTypeIds() : creator_(), registry_() {} TensorTypeIds& TensorTypeIds::singleton() { @@ -16,9 +14,10 @@ TensorTypeIds& TensorTypeIds::singleton() { TensorTypeIdCreator::TensorTypeIdCreator() : last_id_(0) {} at::TensorTypeId TensorTypeIdCreator::create() { + auto id = TensorTypeId(++last_id_); - if (id == max_id_) { + if (last_id_ == 0) { // overflow happened! // If this happens in prod, we have to change // details::_tensorTypeId_underlyingType to uint16_t. AT_ERROR( @@ -59,4 +58,10 @@ TensorTypeIdRegistrar::~TensorTypeIdRegistrar() { TensorTypeIds::singleton().deregister(id_); } +AT_DEFINE_TENSOR_TYPE(UndefinedTensorId); +AT_DEFINE_TENSOR_TYPE(CPUTensorId); +AT_DEFINE_TENSOR_TYPE(CUDATensorId); +AT_DEFINE_TENSOR_TYPE(SparseCPUTensorId); +AT_DEFINE_TENSOR_TYPE(SparseCUDATensorId); + } // namespace at diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.h b/aten/src/ATen/core/TensorTypeIdRegistration.h index 0286115fdc66ac..a7b30932cebe85 100644 --- a/aten/src/ATen/core/TensorTypeIdRegistration.h +++ b/aten/src/ATen/core/TensorTypeIdRegistration.h @@ -16,7 +16,7 @@ namespace at { -class TensorTypeIdCreator final { +class AT_CORE_API TensorTypeIdCreator final { public: TensorTypeIdCreator(); @@ -29,13 +29,10 @@ class TensorTypeIdCreator final { private: std::atomic last_id_; - static constexpr at::TensorTypeId max_id_ = TensorTypeId( - std::numeric_limits::max()); - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); }; -class TensorTypeIdRegistry final { +class AT_CORE_API TensorTypeIdRegistry final { public: TensorTypeIdRegistry(); @@ -49,7 +46,7 @@ class TensorTypeIdRegistry final { AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); }; -class TensorTypeIds final { +class AT_CORE_API TensorTypeIds final { public: static TensorTypeIds& singleton(); @@ -71,7 +68,7 @@ inline constexpr at::TensorTypeId TensorTypeIds::undefined() noexcept { return TensorTypeIdCreator::undefined(); } -class TensorTypeIdRegistrar final { +class AT_CORE_API TensorTypeIdRegistrar final { public: TensorTypeIdRegistrar(); ~TensorTypeIdRegistrar(); @@ -88,12 +85,18 @@ inline at::TensorTypeId TensorTypeIdRegistrar::id() const noexcept { return id_; } -} // namespace at - -#define AT_DECLARE_TENSOR_TYPE(TensorName) at::TensorTypeId TensorName(); +#define AT_DECLARE_TENSOR_TYPE(TensorName) AT_CORE_API at::TensorTypeId TensorName(); #define AT_DEFINE_TENSOR_TYPE(TensorName) \ at::TensorTypeId TensorName() { \ static TensorTypeIdRegistrar registration_raii; \ return registration_raii.id(); \ } + +AT_DECLARE_TENSOR_TYPE(UndefinedTensorId); +AT_DECLARE_TENSOR_TYPE(CPUTensorId); // Caffe2 supported +AT_DECLARE_TENSOR_TYPE(CUDATensorId); // Caffe2 supported 
+AT_DECLARE_TENSOR_TYPE(SparseCPUTensorId); +AT_DECLARE_TENSOR_TYPE(SparseCUDATensorId); + +} // namespace at diff --git a/caffe2/core/typeid.cc b/aten/src/ATen/core/typeid.cpp similarity index 69% rename from caffe2/core/typeid.cc rename to aten/src/ATen/core/typeid.cpp index d4c5294f4b5d35..0ad13150f7c63c 100644 --- a/caffe2/core/typeid.cc +++ b/aten/src/ATen/core/typeid.cpp @@ -1,7 +1,5 @@ -#include "caffe2/core/typeid.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/scope_guard.h" -#include "caffe2/core/tensor.h" +#include +#include #include @@ -28,26 +26,9 @@ std::mutex& gTypeRegistrationMutex() { return g_type_registration_mutex; } -#if defined(_MSC_VER) -// Windows does not have cxxabi.h, so we will simply return the original. -string Demangle(const char* name) { - return string(name); -} -#else -string Demangle(const char* name) { - int status = 0; - auto demangled = ::abi::__cxa_demangle(name, nullptr, nullptr, &status); - if (demangled) { - auto guard = caffe2::MakeGuard([demangled]() { free(demangled); }); - return string(demangled); - } - return name; -} -#endif - string GetExceptionString(const std::exception& e) { #ifdef __GXX_RTTI - return Demangle(typeid(e).name()) + ": " + e.what(); + return at::demangle(typeid(e).name()) + ": " + e.what(); #else return string("Exception (no RTTI available): ") + e.what(); #endif // __GXX_RTTI @@ -56,20 +37,21 @@ string GetExceptionString(const std::exception& e) { void TypeMeta::_ThrowRuntimeTypeLogicError(const std::string& msg) { // In earlier versions it used to be std::abort() but it's a bit hard-core // for a library - CAFFE_THROW(msg); + AT_ERROR(msg); } TypeIdentifier TypeIdentifier::createTypeId() { static std::atomic counter( TypeMeta::Id<_CaffeHighestPreallocatedTypeId>().underlyingId()); const TypeIdentifier::underlying_type new_value = ++counter; - if (new_value == std::numeric_limits::max()) { - throw std::logic_error("Ran out of available type ids. If you need more than 2^16 CAFFE_KNOWN_TYPEs, we need to increase TypeIdentifier to use more than 16 bit."); + if (new_value == + std::numeric_limits::max()) { + throw std::logic_error( + "Ran out of available type ids. If you need more than 2^16 CAFFE_KNOWN_TYPEs, we need to increase TypeIdentifier to use more than 16 bit."); } return TypeIdentifier(new_value); } -CAFFE_DEFINE_KNOWN_TYPE(Tensor); CAFFE_DEFINE_KNOWN_TYPE(float); CAFFE_DEFINE_KNOWN_TYPE(int); CAFFE_DEFINE_KNOWN_TYPE(std::string); @@ -102,9 +84,9 @@ namespace { // for uninitialized blob. You should not use this struct yourself - it is // intended to be only instantiated once here.
struct UninitializedTypeNameRegisterer { - UninitializedTypeNameRegisterer() { - gTypeNames()[TypeIdentifier::uninitialized()] = "nullptr (uninitialized)"; - } + UninitializedTypeNameRegisterer() { + gTypeNames()[TypeIdentifier::uninitialized()] = "nullptr (uninitialized)"; + } }; static UninitializedTypeNameRegisterer g_uninitialized_type_name_registerer; diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h new file mode 100644 index 00000000000000..fd9131b5812246 --- /dev/null +++ b/aten/src/ATen/core/typeid.h @@ -0,0 +1,490 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __GXX_RTTI +#include +#endif + +#include + +#include "ATen/core/Error.h" +#include "ATen/core/Backtrace.h" +#include "ATen/core/Macros.h" +#include "ATen/core/Half.h" +#include "ATen/core/IdWrapper.h" + +// TODO: This file is still in the caffe2 namespace, despite living +// in the ATen directory. This is because the macro CAFFE_DECLARE_KNOWN_TYPE +// defines a template specialization, which relies on the namespace of TypeMeta +// matching the namespace where the macro is called. This requires us to +// fix all of the call-sites, which I want to do later. So the namespace +// is not fixed at the moment. + +namespace caffe2 { +class TypeIdentifier; +} + +std::ostream& operator<<(std::ostream& stream, caffe2::TypeIdentifier typeId); + +namespace caffe2 { + +class TypeMeta; + +/** + * A type id is a unique id for a given C++ type. + * You need to register your types using CAFFE_KNOWN_TYPE(MyType) to be able to + * use TypeIdentifier with custom types. This is for example used to store the + * dtype of tensors. + */ +class TypeIdentifier final : public at::IdWrapper { + public: + static TypeIdentifier createTypeId(); + + friend std::ostream& ::operator<<( + std::ostream& stream, + TypeIdentifier typeId); + friend bool operator<(TypeIdentifier lhs, TypeIdentifier rhs); + + // This is 8, because 0 is uint8_t (due to ScalarType BC constraint) + static constexpr TypeIdentifier uninitialized() { + return TypeIdentifier(8); + } + + private: + constexpr explicit TypeIdentifier(uint16_t id) : IdWrapper(id) {} + friend class TypeMeta; +}; + +// Allow usage in std::map / std::set +// TODO Disallow this and rather use std::unordered_map/set everywhere +inline bool operator<(TypeIdentifier lhs, TypeIdentifier rhs) { + return lhs.underlyingId() < rhs.underlyingId(); +} + +} // namespace caffe2 + +AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::TypeIdentifier) + +inline std::ostream& operator<<( + std::ostream& stream, + caffe2::TypeIdentifier typeId) { + return stream << typeId.underlyingId(); +} + +namespace caffe2 { + +std::unordered_map& gTypeNames(); +std::unordered_set& gRegisteredTypeNames(); + +// A utility function to return an exception std::string by prepending its +// exception type before its what() content. +std::string GetExceptionString(const std::exception& e); + +std::mutex& gTypeRegistrationMutex(); + +template +struct TypeNameRegisterer { + TypeNameRegisterer(TypeIdentifier id, const std::string& literal_name) { + std::lock_guard guard(gTypeRegistrationMutex()); +#ifdef __GXX_RTTI + (void)literal_name; + + std::string name = at::demangle(typeid(T).name()); + // If we are in RTTI mode, we will also use this opportunity to do sanity + // check if there are duplicated ids registered for the same type. This + // usually happens when one does not do RTLD_GLOBAL, which is often the + // case in Python. 
The way we do the check is to make sure that there are + // no duplicated names registered - this could be done by checking the + // uniqueness of names. + if (gRegisteredTypeNames().count(name)) { + AT_ERROR("typeid.h: Type name ", name, " was registered twice. " + "This should not happen. Things to check:\n" + "1. Did you add a new CAFFE_KNOWN_TYPE? If so, check that " + "it is not duplicated with an existing CAFFE_KNOWN_TYPE.\n" + "2. Did you build and install PyTorch and Caffe2 separately? " + "For example, this would be the case if you ran scripts/onnx/install.sh or " + "scripts/onnx/install-develop.sh prior to Aug 12, 2018 " + "(commit 1756daaa7530d). If so, rebuild using the environment variable " + " FULL_CAFFE2=1 (if you build latest master, the ONNX scripts are " + "updated to do this for you.) " + "For more context, see https://github.com/pytorch/pytorch/issues/10460"); + } + gRegisteredTypeNames().insert(name); + gTypeNames()[id] = name; +#else // __GXX_RTTI + if (literal_name.empty()) { + gTypeNames()[id] = "(RTTI disabled, cannot show name)"; + } else { + gTypeNames()[id] = literal_name; + } +#endif // __GXX_RTTI + } +}; + +/** + * TypeMeta is a thin class that allows us to store the type of a container such + * as a blob, or the data type of a tensor, with a unique run-time id. It also + * stores some additional data such as the item size and the name of the type + * for run-time inspection. + */ +class TypeMeta { + public: + using PlacementNew = void(void*, size_t); + using TypedCopy = void(const void*, void*, size_t); + using TypedDestructor = void(void*, size_t); + /** Create a dummy TypeMeta object. To create a TypeMeta object for a specific + * type, use TypeMeta::Make(). + */ + TypeMeta() noexcept + : id_(TypeIdentifier::uninitialized()), + itemsize_(0), + ctor_(nullptr), + copy_(nullptr), + dtor_(nullptr) {} + + /** + * Copy constructor. + */ + TypeMeta(const TypeMeta& src) noexcept = default; + + /** + * Assignment operator. + */ + TypeMeta& operator=(const TypeMeta& src) noexcept = default; + + TypeMeta(TypeMeta&& rhs) noexcept = default; + + private: + // TypeMeta can only be created by Make, making sure that we do not + // create incorrectly mixed up TypeMeta objects. + TypeMeta( + TypeIdentifier i, + size_t s, + PlacementNew* ctor, + TypedCopy* copy, + TypedDestructor* dtor) noexcept + : id_(i), itemsize_(s), ctor_(ctor), copy_(copy), dtor_(dtor) {} + + // Mechanism for throwing errors which can't be prevented at compile time + // due to type erasure. E.g. somebody calling TypeMeta::copy() for + // non-copiable type. Right now just throws exception but is implemented + // in .cpp to manage dependencies + static void _ThrowRuntimeTypeLogicError(const std::string& msg); + + public: + /** + * Returns the type id. + */ + const TypeIdentifier& id() const noexcept { + return id_; + } + /** + * Returns the size of the item. + */ + const size_t& itemsize() const noexcept { + return itemsize_; + } + /** + * Returns the placement new function pointer for individual items. + */ + PlacementNew* ctor() const noexcept { + return ctor_; + } + /** + * Returns the typed copy function pointer for individual items. + */ + TypedCopy* copy() const noexcept { + return copy_; + } + /** + * Returns the destructor function pointer for individual items. + */ + TypedDestructor* dtor() const noexcept { + return dtor_; + } + /** + * Returns a printable name for the type.
+ */ + const char* name() const noexcept { + auto it = gTypeNames().find(id_); + assert(it != gTypeNames().end()); + return it->second.c_str(); + } + + friend bool operator==(const TypeMeta& lhs, const TypeMeta& rhs) noexcept; + + template + bool Match() const { + return (id_ == Id()); + } + + // Below are static functions that can be called by passing a specific type. + + /** + * Returns the unique id for the given type T. The id is unique for the type T + * in the sense that for any two different types, their id are different; for + * the same type T, the id remains the same over different calls of the + * function. However, this is not guaranteed over different runs, as the id + * is generated during run-time. Do NOT serialize the id for storage. + */ + template + AT_CORE_API static TypeIdentifier Id(); + + /** + * Returns the item size of the type. This is equivalent to sizeof(T). + */ + template + static size_t ItemSize() { + return sizeof(T); + } + + /** + * Returns the registered printable name of the type. + * + * Works for only the ones registered with CAFFE_KNOWN_TYPE + */ + template + static const char* TypeName() { + auto it = gTypeNames().find(Id()); + assert(it != gTypeNames().end()); + return it->second.c_str(); + } + + /** + * Placement new function for the type. + */ + template + static void _Ctor(void* ptr, size_t n) { + T* typed_ptr = static_cast(ptr); + for (size_t i = 0; i < n; ++i) { + new (typed_ptr + i) T; + } + } + + template + static void _CtorNotDefault(void* /*ptr*/, size_t /*n*/) { + _ThrowRuntimeTypeLogicError( + "Type " + std::string(at::demangle_type()) + + " is not default-constructible."); + } + + template < + typename T, + typename std::enable_if::value>::type* = + nullptr> + static inline PlacementNew* _PickCtor() { + return _Ctor; + } + + template < + typename T, + typename std::enable_if::value>::type* = + nullptr> + static inline PlacementNew* _PickCtor() { + return _CtorNotDefault; + } + + /** + * Typed copy function for classes. + */ + template + static void _Copy(const void* src, void* dst, size_t n) { + const T* typed_src = static_cast(src); + T* typed_dst = static_cast(dst); + for (size_t i = 0; i < n; ++i) { + typed_dst[i] = typed_src[i]; + } + } + + /** + * A placeholder function for types that do not allow assignment. + */ + template + static void _CopyNotAllowed( + const void* /*src*/, + void* /*dst*/, + size_t /*n*/) { + _ThrowRuntimeTypeLogicError( + "Type " + std::string(at::demangle_type()) + + " does not allow assignment."); + } + + template < + typename T, + typename std::enable_if::value>::type* = + nullptr> + static inline TypedCopy* _PickCopy() { + return _Copy; + } + + template < + typename T, + typename std::enable_if::value>::type* = + nullptr> + static inline TypedCopy* _PickCopy() { + return _CopyNotAllowed; + } + + /** + * Destructor for non-fundamental types. + */ + template + static void _Dtor(void* ptr, size_t n) { + T* typed_ptr = static_cast(ptr); + for (size_t i = 0; i < n; ++i) { + typed_ptr[i].~T(); + } + } + + /** + * Returns a TypeMeta object that corresponds to the typename T. 
+ */ + template + static typename std::enable_if< + std::is_fundamental::value || std::is_pointer::value, + TypeMeta>::type + Make() { + return TypeMeta(Id(), ItemSize(), nullptr, nullptr, nullptr); + } + + template + static typename std::enable_if< + !(std::is_fundamental::value || std::is_pointer::value), + TypeMeta>::type + Make() { + return TypeMeta( + Id(), ItemSize(), _PickCtor(), _PickCopy(), _Dtor); + } + + private: + TypeIdentifier id_; + size_t itemsize_; + PlacementNew* ctor_; + TypedCopy* copy_; + TypedDestructor* dtor_; +}; + +inline bool operator==(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { + return (lhs.id_ == rhs.id_); +} +inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { + return !operator==(lhs, rhs); +} + +/** + * Register unique id for a type so it can be used in TypeMeta context, e.g. be + * used as a type for Blob or for Tensor elements. + * + * CAFFE_KNOWN_TYPE does explicit instantiation of TypeMeta::Id template + * function and thus needs to be put in a single translation unit (.cpp file) + * for a given type T. Other translation units that use type T as a type of the + * caffe2::Blob or element type of caffe2::Tensor need to depend on the + * translation unit that contains CAFFE_KNOWN_TYPE declaration via regular + * linkage dependencies. + * + * NOTE: the macro needs to be invoked in ::caffe2 namespace + */ +// Implementation note: in MSVC, we will need to prepend the AT_CORE_API +// keyword in order to get things compiled properly. in Linux, gcc seems to +// create attribute ignored error for explicit template instantiations, see +// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930 +// and as a result, we define these two macros slightly differently. + +#ifdef _MSC_VER +#define CAFFE_KNOWN_TYPE(T) \ + template <> \ + AT_CORE_API TypeIdentifier TypeMeta::Id() { \ + static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ + static TypeNameRegisterer registerer(type_id, #T); \ + return type_id; \ + } +#else // _MSC_VER +#define CAFFE_KNOWN_TYPE(T) \ + template <> \ + TypeIdentifier TypeMeta::Id() { \ + static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ + static TypeNameRegisterer registerer(type_id, #T); \ + return type_id; \ + } +#endif + +/** + * CAFFE_DECLARE_KNOWN_TYPE and CAFFE_DEFINE_KNOWN_TYPE are used + * to preallocate ids for types that are queried very often so that they + * can be resolved at compile time. Please use CAFFE_KNOWN_TYPE() instead + * for your own types to allocate dynamic ids for them. + */ +#ifdef _MSC_VER +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ + } +#else // _MSC_VER +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ + } +#endif + +#define CONCAT_IMPL(x, y) x##y +#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y) + +#define CAFFE_DEFINE_KNOWN_TYPE(T) \ + namespace { \ + TypeNameRegisterer MACRO_CONCAT(registerer, __COUNTER__)( \ + TypeMeta::Id(), \ + #T); \ + } + +class Tensor; + +// Note: we have preallocated the numbers 0-8 so they line up exactly +// with at::ScalarType's numbering. All other numbers do not matter. +// +// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. 
+ +struct _CaffeHighestPreallocatedTypeId final {}; + +CAFFE_DECLARE_KNOWN_TYPE(0, uint8_t); +CAFFE_DECLARE_KNOWN_TYPE(1, int8_t); +CAFFE_DECLARE_KNOWN_TYPE(2, int16_t); +CAFFE_DECLARE_KNOWN_TYPE(3, int); +CAFFE_DECLARE_KNOWN_TYPE(4, int64_t); +CAFFE_DECLARE_KNOWN_TYPE(5, at::Half); +CAFFE_DECLARE_KNOWN_TYPE(6, float); +CAFFE_DECLARE_KNOWN_TYPE(7, double); +// 8 = undefined type id + +CAFFE_DECLARE_KNOWN_TYPE(9, Tensor); +CAFFE_DECLARE_KNOWN_TYPE(10, std::string); +CAFFE_DECLARE_KNOWN_TYPE(11, bool); +CAFFE_DECLARE_KNOWN_TYPE(12, uint16_t); +CAFFE_DECLARE_KNOWN_TYPE(13, char); +CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr); +CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr>); +CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(18, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(19, bool*); +CAFFE_DECLARE_KNOWN_TYPE(20, char*); +CAFFE_DECLARE_KNOWN_TYPE(21, int*); + +#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA +CAFFE_DECLARE_KNOWN_TYPE(22, long); +CAFFE_DECLARE_KNOWN_TYPE(23, std::vector); +#endif // CAFFE2_UNIQUE_LONG_TYPEMETA + +CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId); +} // namespace caffe2 diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index fdbc6f2612e8de..b19b962a23dc0e 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -180,7 +180,7 @@ def TypedDict(name, attrs, total=True): # type: ignore }""") BUFFER_DEFINITION = CodeTemplate("""\ -auto ${name}_ = new TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, ${THTensor}_new(), false); +auto ${name}_ = new TensorImpl(${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false); auto ${name} = Tensor(${name}_, false);""") CONDITIONAL_INITIALIZER = CodeTemplate("""\ @@ -320,23 +320,23 @@ def __init__(self, reason): CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? 
${usage} : NULL') ALLOC_NOARGS_WRAP = { - 'THTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::${ScalarName})', - 'THBoolTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Byte)', - 'THIndexTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Long)', - 'THIntegerTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Int)', + 'THTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::${ScalarName})', + 'THBoolTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Byte)', + 'THIndexTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Long)', + 'THIntegerTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Int)', 'THSTensor*': 'detail::new_Sparse${Tensor}()', - 'THDenseTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::${ScalarName})', - 'THDenseIndexTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Long)' + 'THDenseTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::${ScalarName})', + 'THDenseIndexTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Long)' } ALLOC_WRAP = { - 'THTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, ${arguments}, false)', - 'THBoolTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Byte, ${arguments}, false)', - 'THIndexTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Long, ${arguments}, false)', - 'THIntegerTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Int, ${arguments}, false)', + 'THTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::${ScalarName}, ${arguments}, false)', + 'THBoolTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Byte, ${arguments}, false)', + 'THIndexTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Long, ${arguments}, false)', + 'THIntegerTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Int, ${arguments}, false)', 'THSTensor*': 'new Sparse${Tensor}(${arguments})', - 'THDenseTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, ${arguments}, false)', - 'THDenseIndexTensor*': 'new TensorImpl(Backend::${Backend}, ScalarType::Long, ${arguments}, false)', + 'THDenseTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::${ScalarName}, ${arguments}, false)', + 'THDenseIndexTensor*': 'new TensorImpl(${Backend}TensorId(), ScalarType::Long, ${arguments}, false)', } # Replacements for constants when calling into TH diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 7a7e8be5c7ff6a..6eb9b8cfb58071 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -62,8 +62,13 @@ SparseTensor new_sparse(const SparseType& dtype) { AT_ASSERT(!dtype.is_undefined()); AT_ASSERT(!dtype.is_variable()); AT_ASSERT(dtype.is_sparse()); - // TODO: Hmm... 
this const_cast business seems a bit dodgy - return SparseTensor(new SparseTensorImpl(dtype.backend(), dtype.scalarType()), /* retain */ false); + TensorTypeId type_id; + if (dtype.is_cuda()) { + type_id = SparseCUDATensorId(); + } else { + type_id = SparseCPUTensorId(); + } + return SparseTensor(new SparseTensorImpl(type_id, dtype.scalarType()), /* retain */ false); } /*** Helper methods ***/ diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 977bbbb079460d..6c3094e71aa0df 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -28,7 +28,7 @@ namespace at { ${Type}::${Type}(Context* context) - : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} + : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index a01c3c5e3447f5..10036a5286b5bc 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -13,6 +13,7 @@ #include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" +#include "ATen/core/TensorTypeIdRegistration.h" #include "THNN/Reduction.h" #include @@ -44,8 +45,8 @@ enum class TypeID { }; struct AT_API Type { - explicit Type(Context* context, bool is_variable, bool is_undefined) - : context(context), is_variable_(is_variable), is_undefined_(is_undefined) {} + explicit Type(Context* context, TensorTypeId type_id, bool is_variable, bool is_undefined) + : context(context), type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} virtual ~Type() {} virtual ScalarType scalarType() const = 0; virtual Backend backend() const = 0; @@ -79,6 +80,9 @@ struct AT_API Type { // for external dispatch virtual TypeID ID() const = 0; + // New-style TensorTypeId that supports open registration. 
+ TensorTypeId type_id() const { return type_id_; } + Tensor copy(const Tensor & src, bool non_blocking=false) const; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; @@ -98,6 +102,7 @@ struct AT_API Type { ${type_method_declarations} protected: Context* context; + TensorTypeId type_id_; bool is_variable_; bool is_undefined_; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index a4ad1dfc205211..2bd041e37a7fc5 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -39,7 +39,7 @@ static int getPointerDevice(void* ptr) { #endif ${Type}::${Type}(Context* context) - : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} + : Type(context, ${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } @@ -95,7 +95,7 @@ std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) ${THTensor}_retain(${state,} (${THTensor}*) th_pointer); - return Tensor(new TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, + return Tensor(new TensorImpl(${Backend}TensorId(), ScalarType::${ScalarName}, (${THTensor}*)(th_pointer), false), false); } std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index f5cdc310ae18b8..9aba3ffc9bda4f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -59,8 +59,7 @@ else() list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) - # TODO: We should probably install the headers, but I don't know - # how to do that. + # See cmake/Codegen.cmake for header installation endif() # ---[ Torch build @@ -326,6 +325,14 @@ if(USE_CUDA) target_link_libraries( caffe2_gpu PUBLIC caffe2 ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) + # See Note [Supporting both static and dynamic libraries on Window] + # TODO: I'm actually not sure why this is necessary, because caffe2_gpu + # should depend on caffe2 (which itself would give us the necessary + # macro definition). + if (MSVC AND NOT BUILD_SHARED_LIBS) + target_compile_options(caffe2_gpu PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") + endif() + # Set standard properties on the target aten_set_target_props(caffe2_gpu) diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h index 6d8904560fe915..5a7adf7d143af0 100644 --- a/caffe2/core/flags.h +++ b/caffe2/core/flags.h @@ -26,12 +26,12 @@ namespace caffe2 { /** * Sets the usage message when a commandline tool is called with "--help". */ -void SetUsageMessage(const string& str); +CAFFE2_API void SetUsageMessage(const string& str); /** * Returns the usage message for the commandline tool set by SetUsageMessage. */ -const char* UsageMessage(); +CAFFE2_API const char* UsageMessage(); /** * Parses the commandline flags. @@ -41,11 +41,11 @@ const char* UsageMessage(); * commandline args that caffe2 does not deal with. Note that following * convention, argv[0] contains the binary name and is not parsed. */ -bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv); +CAFFE2_API bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv); /** * Checks if the commandline flags has already been passed. 
*/ -bool CommandLineFlagsHasBeenParsed(); +CAFFE2_API bool CommandLineFlagsHasBeenParsed(); } // namespace caffe2 @@ -56,6 +56,10 @@ bool CommandLineFlagsHasBeenParsed(); #ifdef CAFFE2_USE_GFLAGS +//////////////////////////////////////////////////////////////////////////////// +// Begin gflags section: most functions are basically rerouted to gflags. +//////////////////////////////////////////////////////////////////////////////// + #include // gflags before 2.0 uses namespace google and after 2.1 uses namespace gflags. @@ -64,41 +68,70 @@ bool CommandLineFlagsHasBeenParsed(); namespace gflags = google; #endif // GFLAGS_GFLAGS_H_ -#define CAFFE2_GFLAGS_DEF_WRAPPER(type, name, default_value, help_str) \ +// Motivation about the gflags wrapper: +// (1) We would need to make sure that the gflags version and the non-gflags +// version of Caffe2 are going to expose the same flags abstraction. One should +// explicitly use caffe2::FLAGS_flag_name to access the flags. +// (2) For flag names, it is recommended to start with caffe2_ to distinguish it +// from regular gflags flags. For example, do +// CAFFE2_DEFINE_BOOL(caffe2_my_flag, true, "An example"); +// to allow one to use caffe2::FLAGS_caffe2_my_flag. +// (3) Gflags has a design issue that does not properly expose the global flags, +// if one builds the library with -fvisibility=hidden. The current gflags (as of +// Aug 2018) only deals with the Windows case using dllexport, and not the Linux +// counterparts. As a result, we will explicitly use CAFFE2_EXPORT to export the +// flags defined in Caffe2. This is done via a global reference, so the flag +// itself is not duplicated - under the hood it is the same global gflags flag. +#define CAFFE2_GFLAGS_DEF_WRAPPER( \ + type, real_type, name, default_value, help_str) \ DEFINE_##type(name, default_value, help_str); \ namespace caffe2 { \ - using ::FLAGS_##name; \ + CAFFE2_EXPORT real_type& FLAGS_##name = ::FLAGS_##name; \ } #define CAFFE2_DEFINE_int(name, default_value, help_str) \ - CAFFE2_GFLAGS_DEF_WRAPPER(int32, name, default_value, help_str) + CAFFE2_GFLAGS_DEF_WRAPPER(int32, gflags::int32, name, default_value, help_str) #define CAFFE2_DEFINE_int64(name, default_value, help_str) \ - CAFFE2_GFLAGS_DEF_WRAPPER(int64, name, default_value, help_str) + CAFFE2_GFLAGS_DEF_WRAPPER(int64, gflags::int64, name, default_value, help_str) #define CAFFE2_DEFINE_double(name, default_value, help_str) \ - CAFFE2_GFLAGS_DEF_WRAPPER(double, name, default_value, help_str) + CAFFE2_GFLAGS_DEF_WRAPPER(double, double, name, default_value, help_str) #define CAFFE2_DEFINE_bool(name, default_value, help_str) \ - CAFFE2_GFLAGS_DEF_WRAPPER(bool, name, default_value, help_str) -#define CAFFE2_DEFINE_string(name, default_value, help_str) \ - CAFFE2_GFLAGS_DEF_WRAPPER(string, name, default_value, help_str) + CAFFE2_GFLAGS_DEF_WRAPPER(bool, bool, name, default_value, help_str) +#define CAFFE2_DEFINE_string(name, default_value, help_str) \ + CAFFE2_GFLAGS_DEF_WRAPPER( \ + string, ::fLS::clstring, name, default_value, help_str) // DECLARE_typed_var should be used in header files and in the global namespace.
-#define CAFFE2_GFLAGS_DECLARE_WRAPPER(type, name) \ - DECLARE_##type(name); \ - namespace caffe2 { \ - using ::FLAGS_##name; \ +#define CAFFE2_GFLAGS_DECLARE_WRAPPER(type, real_type, name) \ + DECLARE_##type(name); \ + namespace caffe2 { \ + extern real_type& FLAGS_##name ; \ } // namespace caffe2 -#define CAFFE2_DECLARE_int(name) CAFFE2_GFLAGS_DECLARE_WRAPPER(int32, name) -#define CAFFE2_DECLARE_int64(name) CAFFE2_GFLAGS_DECLARE_WRAPPER(int64, name) -#define CAFFE2_DECLARE_double(name) CAFFE2_GFLAGS_DECLARE_WRAPPER(double, name) -#define CAFFE2_DECLARE_bool(name) CAFFE2_GFLAGS_DECLARE_WRAPPER(bool, name) -#define CAFFE2_DECLARE_string(name) CAFFE2_GFLAGS_DECLARE_WRAPPER(string, name) +#define CAFFE2_DECLARE_int(name) \ + CAFFE2_GFLAGS_DECLARE_WRAPPER(int32, gflags::int32, name) +#define CAFFE2_DECLARE_int64(name) \ + CAFFE2_GFLAGS_DECLARE_WRAPPER(int64, gflags::int64, name) +#define CAFFE2_DECLARE_double(name) \ + CAFFE2_GFLAGS_DECLARE_WRAPPER(double, double, name) +#define CAFFE2_DECLARE_bool(name) \ + CAFFE2_GFLAGS_DECLARE_WRAPPER(bool, bool, name) +#define CAFFE2_DECLARE_string(name) \ + CAFFE2_GFLAGS_DECLARE_WRAPPER(string, ::fLS::clstring, name) + +//////////////////////////////////////////////////////////////////////////////// +// End gflags section. +//////////////////////////////////////////////////////////////////////////////// #else // CAFFE2_USE_GFLAGS +//////////////////////////////////////////////////////////////////////////////// +// Begin non-gflags section: providing equivalent functionality. +//////////////////////////////////////////////////////////////////////////////// + namespace caffe2 { -class Caffe2FlagParser { +class CAFFE2_API Caffe2FlagParser { public: Caffe2FlagParser() {} bool success() { return success_; } @@ -117,29 +150,29 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); // write the CAFFE2_DEFINE_* and CAFFE2_DECLARE_* macros outside any namespace // as well. 
-#define CAFFE2_DEFINE_typed_var(type, name, default_value, help_str) \ - namespace caffe2 { \ - CAFFE2_EXPORT type FLAGS_##name = default_value; \ - namespace { \ - class Caffe2FlagParser_##name : public Caffe2FlagParser { \ - public: \ - explicit Caffe2FlagParser_##name(const string& content) { \ - success_ = Caffe2FlagParser::Parse(content, &FLAGS_##name); \ - } \ - }; \ - } \ - RegistererCaffe2FlagsRegistry g_Caffe2FlagsRegistry_##name( \ - #name, \ - Caffe2FlagsRegistry(), \ - RegistererCaffe2FlagsRegistry::DefaultCreator, \ - "(" #type ", default " #default_value ") " help_str); \ +#define CAFFE2_DEFINE_typed_var(type, name, default_value, help_str) \ + namespace caffe2 { \ + CAFFE2_EXPORT type FLAGS_##name = default_value; \ + namespace { \ + class Caffe2FlagParser_##name : public Caffe2FlagParser { \ + public: \ + explicit Caffe2FlagParser_##name(const string& content) { \ + success_ = Caffe2FlagParser::Parse(content, &FLAGS_##name); \ + } \ + }; \ + } \ + RegistererCaffe2FlagsRegistry g_Caffe2FlagsRegistry_##name( \ + #name, \ + Caffe2FlagsRegistry(), \ + RegistererCaffe2FlagsRegistry::DefaultCreator, \ + "(" #type ", default " #default_value ") " help_str); \ } #define CAFFE2_DEFINE_int(name, default_value, help_str) \ CAFFE2_DEFINE_typed_var(int, name, default_value, help_str) -#define CAFFE2_DEFINE_int64(name, default_value, help_str) \ +#define CAFFE2_DEFINE_int64(name, default_value, help_str) \ CAFFE2_DEFINE_typed_var(int64_t, name, default_value, help_str) -#define CAFFE2_DEFINE_double(name, default_value, help_str) \ +#define CAFFE2_DEFINE_double(name, default_value, help_str) \ CAFFE2_DEFINE_typed_var(double, name, default_value, help_str) #define CAFFE2_DEFINE_bool(name, default_value, help_str) \ CAFFE2_DEFINE_typed_var(bool, name, default_value, help_str) @@ -147,9 +180,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); CAFFE2_DEFINE_typed_var(string, name, default_value, help_str) // DECLARE_typed_var should be used in header files and in the global namespace. -#define CAFFE2_DECLARE_typed_var(type, name) \ - namespace caffe2 { \ - CAFFE2_IMPORT extern type FLAGS_##name; \ +#define CAFFE2_DECLARE_typed_var(type, name) \ + namespace caffe2 { \ + CAFFE2_IMPORT extern type FLAGS_##name; \ } // namespace caffe2 #define CAFFE2_DECLARE_int(name) CAFFE2_DECLARE_typed_var(int, name) @@ -158,6 +191,10 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); #define CAFFE2_DECLARE_bool(name) CAFFE2_DECLARE_typed_var(bool, name) #define CAFFE2_DECLARE_string(name) CAFFE2_DECLARE_typed_var(string, name) +//////////////////////////////////////////////////////////////////////////////// +// End non-gflags section. 
+//////////////////////////////////////////////////////////////////////////////// + #endif // CAFFE2_USE_GFLAGS #endif // CAFFE2_CORE_FLAGS_H_ diff --git a/caffe2/core/flags_test.cc b/caffe2/core/flags_test.cc new file mode 100644 index 00000000000000..28bbe3d0688265 --- /dev/null +++ b/caffe2/core/flags_test.cc @@ -0,0 +1,27 @@ +#include +#include "caffe2/core/macros.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" + +CAFFE2_DEFINE_bool(caffe2_flags_test_only_flag, true, "Only used in test."); + +namespace caffe2 { + +TEST(FlagsTest, TestGflagsCorrectness) { +#ifdef CAFFE2_USE_GFLAGS + EXPECT_EQ(FLAGS_caffe2_flags_test_only_flag, true); + EXPECT_EQ(::FLAGS_caffe2_flags_test_only_flag, true); + // Change the caffe2 namespace and check global + FLAGS_caffe2_flags_test_only_flag = false; + EXPECT_EQ(FLAGS_caffe2_flags_test_only_flag, false); + EXPECT_EQ(::FLAGS_caffe2_flags_test_only_flag, false); + // Change global and check caffe2 namespace + ::FLAGS_caffe2_flags_test_only_flag = true; + EXPECT_EQ(FLAGS_caffe2_flags_test_only_flag, true); + EXPECT_EQ(::FLAGS_caffe2_flags_test_only_flag, true); +#else // CAFFE2_USE_GFLAGS + LOG(INFO) << "Caffe2 is not built with gflags. Nothing to test here."; +#endif +} + +} // namespace caffe2 diff --git a/caffe2/core/logging.cc b/caffe2/core/logging.cc index 7de8f9aacd262a..0c49f97932634c 100644 --- a/caffe2/core/logging.cc +++ b/caffe2/core/logging.cc @@ -69,20 +69,27 @@ std::function GetOperatorLogger() { #ifdef CAFFE2_USE_GOOGLE_GLOG #ifdef CAFFE2_USE_GFLAGS +// When GLOG depends on GFLAGS, these variables are being defined in GLOG +// directly via the GFLAGS definition, so we will use DECLARE_* to declare +// them, and use them in Caffe2. // GLOG's minloglevel -CAFFE2_DECLARE_int(minloglevel); +DECLARE_int32(minloglevel); // GLOG's verbose log value. -CAFFE2_DECLARE_int(v); +DECLARE_int32(v); // GLOG's logtostderr value -CAFFE2_DECLARE_bool(logtostderr); - -#else +DECLARE_bool(logtostderr); +#endif // CAFFE2_USE_GFLAGS +// Provide easy access to the above variables, regardless whether GLOG is +// dependent on GFLAGS or not. Note that the namespace (fLI, fLB) is actually +// consistent between GLOG and GFLAGS, so we can do the below declaration +// consistently. 
+namespace caffe2 { using fLI::FLAGS_minloglevel; using fLI::FLAGS_v; using fLB::FLAGS_logtostderr; +} // namespace caffe2 -#endif // CAFFE2_USE_GFLAGS CAFFE2_DEFINE_int(caffe2_log_level, google::GLOG_ERROR, "The minimum log level that caffe2 will output."); diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index f5e0932228a977..d55b1e181cefde 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -178,7 +178,7 @@ class Registerer { key, \ RegistryName(), \ Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - DemangleType<__VA_ARGS__>()); \ + at::demangle_type<__VA_ARGS__>()); \ } // CAFFE_DECLARE_REGISTRY and CAFFE_DEFINE_REGISTRY are hard-wired to use string diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index fa494902b1d5f4..624854515e9e2b 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -16,6 +16,8 @@ CAFFE2_DEFINE_int64( namespace caffe2 { +CAFFE_DEFINE_KNOWN_TYPE(Tensor); + TensorPrinter::TensorPrinter( const std::string& tensor_name, const std::string& file_name, diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index 609c67a61dbf20..412de1d96a2286 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -1,480 +1,7 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef __GXX_RTTI -#include -#endif - -#include - -#include "ATen/core/Half.h" -#include "caffe2/core/common.h" -#include "ATen/core/IdWrapper.h" - -namespace caffe2 { -class TypeIdentifier; -} - -std::ostream& operator<<(std::ostream& stream, caffe2::TypeIdentifier typeId); - -namespace caffe2 { - -class TypeMeta; - -/** - * A type id is a unique id for a given C++ type. - * You need to register your types using CAFFE_KNOWN_TYPE(MyType) to be able to use TypeIdentifier with custom types. - * This is for example used to store the dtype of tensors. - */ -class TypeIdentifier final : public at::IdWrapper { -public: - static TypeIdentifier createTypeId(); - - friend std::ostream& ::operator<<(std::ostream& stream, TypeIdentifier typeId); - friend bool operator<(TypeIdentifier lhs, TypeIdentifier rhs); - - // This is 8, because 0 is uint8_t (due to ScalarType BC constraint) - static constexpr TypeIdentifier uninitialized() { - return TypeIdentifier(8); - } - -private: - constexpr explicit TypeIdentifier(uint16_t id): IdWrapper(id) {} - friend class TypeMeta; -}; - -// Allow usage in std::map / std::set -// TODO Disallow this and rather use std::unordered_map/set everywhere -inline bool operator<(TypeIdentifier lhs, TypeIdentifier rhs) { - return lhs.underlyingId() < rhs.underlyingId(); -} - -} - -AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::TypeIdentifier) - -inline std::ostream& operator<<(std::ostream& stream, caffe2::TypeIdentifier typeId) { - return stream << typeId.underlyingId(); -} - -namespace caffe2 { - -std::unordered_map& gTypeNames(); -std::unordered_set& gRegisteredTypeNames(); - -// A utility function to demangle a function name. -std::string Demangle(const char* name); - -/** - * Returns the printable name of the type. - * - * Works for all types, not only the ones registered with CAFFE_KNOWN_TYPE - */ -template -static const char* DemangleType() { -#ifdef __GXX_RTTI - static const std::string name = Demangle(typeid(T).name()); - return name.c_str(); -#else // __GXX_RTTI - return "(RTTI disabled, cannot show name)"; -#endif // __GXX_RTTI -} - -// A utility function to return an exception std::string by prepending its exception -// type before its what() content. 
-std::string GetExceptionString(const std::exception& e);
-
-std::mutex& gTypeRegistrationMutex();
-
-template <typename T>
-struct TypeNameRegisterer {
-  TypeNameRegisterer(TypeIdentifier id, const std::string& literal_name) {
-    std::lock_guard<std::mutex> guard(gTypeRegistrationMutex());
-#ifdef __GXX_RTTI
-    (void)literal_name;
-
-    std::string name = Demangle(typeid(T).name());
-    // If we are in RTTI mode, we will also use this opportunity to do sanity
-    // check if there are duplicated ids registered for the same type. This
-    // usually happens when one does not do RTLD_GLOBAL, which is often the
-    // case in Python. The way we do the check is to make sure that there are
-    // no duplicated names registered - this could be done by checking the
-    // uniqueness of names.
-    if (gRegisteredTypeNames().count(name)) {
-      std::cerr << "Type name " << name
-                << " registered twice. This should "
-                   "not happen. Do you have duplicated CAFFE_KNOWN_TYPE?"
-                << std::endl;
-      throw std::runtime_error("TypeNameRegisterer error with type " + name);
-    }
-    gRegisteredTypeNames().insert(name);
-    gTypeNames()[id] = name;
-#else // __GXX_RTTI
-    if (literal_name.empty()) {
-      gTypeNames()[id] = "(RTTI disabled, cannot show name)";
-    } else {
-      gTypeNames()[id] = literal_name;
-    }
-#endif // __GXX_RTTI
-  }
-};
-
-/**
- * TypeMeta is a thin class that allows us to store the type of a container
- * such as a blob, or the data type of a tensor, with a unique run-time id.
- * It also stores some additional data such as the item size and the name of
- * the type for run-time inspection.
- */
-class TypeMeta {
- public:
-  using PlacementNew = void (void*, size_t);
-  using TypedCopy = void (const void*, void*, size_t);
-  using TypedDestructor = void (void*, size_t);
-  /** Create a dummy TypeMeta object. To create a TypeMeta object for a
-   * specific type, use TypeMeta::Make<T>().
-   */
-  TypeMeta() noexcept
-      : id_(TypeIdentifier::uninitialized()), itemsize_(0), ctor_(nullptr), copy_(nullptr), dtor_(nullptr) {}
-
-  /**
-   * Copy constructor.
-   */
-  TypeMeta(const TypeMeta& src) noexcept = default;
-
-  /**
-   * Assignment operator.
-   */
-  TypeMeta& operator=(const TypeMeta& src) noexcept = default;
-
-  TypeMeta(TypeMeta &&rhs) noexcept = default;
-
- private:
-  // TypeMeta can only be created by Make, making sure that we do not
-  // create incorrectly mixed up TypeMeta objects.
-  TypeMeta(
-      TypeIdentifier i,
-      size_t s,
-      PlacementNew* ctor,
-      TypedCopy* copy,
-      TypedDestructor* dtor) noexcept
-      : id_(i), itemsize_(s), ctor_(ctor), copy_(copy), dtor_(dtor) {}
-
-  // Mechanism for throwing errors which can't be prevented at compile time
-  // due to type erasure. E.g. somebody calling TypeMeta::copy() for
-  // non-copyable type. Right now just throws exception but is implemented
-  // in .cpp to manage dependencies
-  static void _ThrowRuntimeTypeLogicError(const std::string& msg);
-
- public:
-  /**
-   * Returns the type id.
-   */
-  const TypeIdentifier& id() const noexcept {
-    return id_;
-  }
-  /**
-   * Returns the size of the item.
-   */
-  const size_t& itemsize() const noexcept {
-    return itemsize_;
-  }
-  /**
-   * Returns the placement new function pointer for individual items.
-   */
-  PlacementNew* ctor() const noexcept {
-    return ctor_;
-  }
-  /**
-   * Returns the typed copy function pointer for individual items.
-   */
-  TypedCopy* copy() const noexcept {
-    return copy_;
-  }
-  /**
-   * Returns the destructor function pointer for individual items.
-   */
-  TypedDestructor* dtor() const noexcept {
-    return dtor_;
-  }
-  /**
-   * Returns a printable name for the type.
-   */
-  const char* name() const noexcept {
-    auto it = gTypeNames().find(id_);
-    assert(it != gTypeNames().end());
-    return it->second.c_str();
-  }
-
-  friend bool operator==(const TypeMeta& lhs, const TypeMeta& rhs) noexcept;
-
-  template <typename T>
-  bool Match() const {
-    return (id_ == Id<T>());
-  }
-
-  // Below are static functions that can be called by passing a specific type.
-
-  /**
-   * Returns the unique id for the given type T. The id is unique for the type
-   * T in the sense that for any two different types, their ids are different;
-   * for the same type T, the id remains the same over different calls of the
-   * function. However, this is not guaranteed over different runs, as the id
-   * is generated during run-time. Do NOT serialize the id for storage.
-   */
-  template <typename T>
-  CAFFE2_API static TypeIdentifier Id();
-
-  /**
-   * Returns the item size of the type. This is equivalent to sizeof(T).
-   */
-  template <typename T>
-  static size_t ItemSize() {
-    return sizeof(T);
-  }
-
-  /**
-   * Returns the registered printable name of the type.
-   *
-   * Works for only the ones registered with CAFFE_KNOWN_TYPE
-   */
-  template <typename T>
-  static const char* TypeName() {
-    auto it = gTypeNames().find(Id<T>());
-    assert(it != gTypeNames().end());
-    return it->second.c_str();
-  }
-
-  /**
-   * Placement new function for the type.
-   */
-  template <typename T>
-  static void _Ctor(void* ptr, size_t n) {
-    T* typed_ptr = static_cast<T*>(ptr);
-    for (size_t i = 0; i < n; ++i) {
-      new (typed_ptr + i) T;
-    }
-  }
-
-  template <typename T>
-  static void _CtorNotDefault(void* /*ptr*/, size_t /*n*/) {
-    _ThrowRuntimeTypeLogicError(
-        "Type " + std::string(DemangleType<T>()) +
-        " is not default-constructible.");
-  }
-
-  template <
-      typename T,
-      typename std::enable_if<std::is_default_constructible<T>::value>::type* =
-          nullptr>
-  static inline PlacementNew* _PickCtor() {
-    return _Ctor<T>;
-  }
-
-  template <
-      typename T,
-      typename std::enable_if<!std::is_default_constructible<T>::value>::type* =
-          nullptr>
-  static inline PlacementNew* _PickCtor() {
-    return _CtorNotDefault<T>;
-  }
-
-  /**
-   * Typed copy function for classes.
-   */
-  template <typename T>
-  static void _Copy(const void* src, void* dst, size_t n) {
-    const T* typed_src = static_cast<const T*>(src);
-    T* typed_dst = static_cast<T*>(dst);
-    for (size_t i = 0; i < n; ++i) {
-      typed_dst[i] = typed_src[i];
-    }
-  }
-
-  /**
-   * A placeholder function for types that do not allow assignment.
-   */
-  template <typename T>
-  static void
-  _CopyNotAllowed(const void* /*src*/, void* /*dst*/, size_t /*n*/) {
-    _ThrowRuntimeTypeLogicError(
-        "Type " + std::string(DemangleType<T>()) +
-        " does not allow assignment.");
-  }
-
-  template <
-      typename T,
-      typename std::enable_if<std::is_copy_assignable<T>::value>::type* =
-          nullptr>
-  static inline TypedCopy* _PickCopy() {
-    return _Copy<T>;
-  }
-
-  template <
-      typename T,
-      typename std::enable_if<!std::is_copy_assignable<T>::value>::type* =
-          nullptr>
-  static inline TypedCopy* _PickCopy() {
-    return _CopyNotAllowed<T>;
-  }
-
-  /**
-   * Destructor for non-fundamental types.
-   */
-  template <typename T>
-  static void _Dtor(void* ptr, size_t n) {
-    T* typed_ptr = static_cast<T*>(ptr);
-    for (size_t i = 0; i < n; ++i) {
-      typed_ptr[i].~T();
-    }
-  }
-
-  /**
-   * Returns a TypeMeta object that corresponds to the typename T.
-   */
-  template <typename T>
-  static typename std::enable_if<
-      std::is_fundamental<T>::value || std::is_pointer<T>::value,
-      TypeMeta>::type
-  Make() {
-    return TypeMeta(Id<T>(), ItemSize<T>(), nullptr, nullptr, nullptr);
-  }
-
-  template <typename T>
-  static typename std::enable_if<
-      !(std::is_fundamental<T>::value || std::is_pointer<T>::value),
-      TypeMeta>::type
-  Make() {
-    return TypeMeta(
-        Id<T>(), ItemSize<T>(), _PickCtor<T>(), _PickCopy<T>(), _Dtor<T>);
-  }
-
- private:
-  TypeIdentifier id_;
-  size_t itemsize_;
-  PlacementNew* ctor_;
-  TypedCopy* copy_;
-  TypedDestructor* dtor_;
-};
-
-inline bool operator==(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
-  return (lhs.id_ == rhs.id_);
-}
-inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept {
-  return !operator==(lhs, rhs);
-}
-
-/**
- * Register unique id for a type so it can be used in TypeMeta context, e.g. be
- * used as a type for Blob or for Tensor elements.
- *
- * CAFFE_KNOWN_TYPE does explicit instantiation of the TypeMeta::Id<T> template
- * function and thus needs to be put in a single translation unit (.cpp file)
- * for a given type T. Other translation units that use type T as a type of the
- * caffe2::Blob or element type of caffe2::Tensor need to depend on the
- * translation unit that contains CAFFE_KNOWN_TYPE declaration via regular
- * linkage dependencies.
- *
- * NOTE: the macro needs to be invoked in ::caffe2 namespace
- */
-// Implementation note: in MSVC, we will need to prepend the CAFFE2_EXPORT
-// keyword in order to get things compiled properly. In Linux, gcc seems to
-// create an attribute-ignored error for explicit template instantiations, see
-//   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html
-//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930
-// and as a result, we define these two macros slightly differently.
-
-#ifdef _MSC_VER
-#define CAFFE_KNOWN_TYPE(T) \
-  template <> \
-  CAFFE2_EXPORT TypeIdentifier TypeMeta::Id<T>() { \
-    static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \
-    static TypeNameRegisterer<T> registerer(type_id, #T); \
-    return type_id; \
-  }
-#else // _MSC_VER
-#define CAFFE_KNOWN_TYPE(T) \
-  template <> \
-  TypeIdentifier TypeMeta::Id<T>() { \
-    static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \
-    static TypeNameRegisterer<T> registerer(type_id, #T); \
-    return type_id; \
-  }
-#endif
-
-/**
- * CAFFE_DECLARE_KNOWN_TYPE and CAFFE_DEFINE_KNOWN_TYPE are used
- * to preallocate ids for types that are queried very often so that they
- * can be resolved at compile time. Please use CAFFE_KNOWN_TYPE() instead
- * for your own types to allocate dynamic ids for them.
- */
-#ifdef _MSC_VER
-#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \
-  template <> \
-  inline CAFFE2_EXPORT TypeIdentifier TypeMeta::Id<T>() { \
-    return TypeIdentifier(PreallocatedId); \
-  }
-#else // _MSC_VER
-#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \
-  template <> \
-  inline TypeIdentifier TypeMeta::Id<T>() { \
-    return TypeIdentifier(PreallocatedId); \
-  }
-#endif
-
-#define CONCAT_IMPL(x, y) x##y
-#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)
-
-#define CAFFE_DEFINE_KNOWN_TYPE(T) \
-  namespace { \
-  TypeNameRegisterer<T> MACRO_CONCAT(registerer, __COUNTER__)( \
-      TypeMeta::Id<T>(), \
-      #T); \
-  }
-
-class Tensor;
-
-// Note: we have preallocated the numbers 0-8 so they line up exactly
-// with at::ScalarType's numbering. All other numbers do not matter.
-//
-// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins.
-
-struct _CaffeHighestPreallocatedTypeId final {};
-
-CAFFE_DECLARE_KNOWN_TYPE(0, uint8_t);
-CAFFE_DECLARE_KNOWN_TYPE(1, int8_t);
-CAFFE_DECLARE_KNOWN_TYPE(2, int16_t);
-CAFFE_DECLARE_KNOWN_TYPE(3, int);
-CAFFE_DECLARE_KNOWN_TYPE(4, int64_t);
-CAFFE_DECLARE_KNOWN_TYPE(5, at::Half);
-CAFFE_DECLARE_KNOWN_TYPE(6, float);
-CAFFE_DECLARE_KNOWN_TYPE(7, double);
-// 8 = undefined type id
-
-CAFFE_DECLARE_KNOWN_TYPE(9, Tensor);
-CAFFE_DECLARE_KNOWN_TYPE(10, std::string);
-CAFFE_DECLARE_KNOWN_TYPE(11, bool);
-CAFFE_DECLARE_KNOWN_TYPE(12, uint16_t);
-CAFFE_DECLARE_KNOWN_TYPE(13, char);
-CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr<std::mutex>);
-CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr<std::atomic<bool>>);
-CAFFE_DECLARE_KNOWN_TYPE(16, std::vector<int32_t>);
-CAFFE_DECLARE_KNOWN_TYPE(17, std::vector<int64_t>);
-CAFFE_DECLARE_KNOWN_TYPE(18, std::vector<unsigned long>);
-CAFFE_DECLARE_KNOWN_TYPE(19, bool*);
-CAFFE_DECLARE_KNOWN_TYPE(20, char*);
-CAFFE_DECLARE_KNOWN_TYPE(21, int*);
-
-#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
-CAFFE_DECLARE_KNOWN_TYPE(22, long);
-CAFFE_DECLARE_KNOWN_TYPE(23, std::vector<long>);
-#endif // CAFFE2_UNIQUE_LONG_TYPEMETA
-
-CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId);
-}
+// If I omit this header, the Windows build fails. The error message
+// was sufficiently bad that I couldn't figure out which downstream file
+// was missing the include of common.h. So keep it here for BC.
+#include "caffe2/core/common.h"
+#include <ATen/core/typeid.h>
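The registration machinery deleted above moves wholesale into ATen/core. As a reminder of how it is consumed, here is a minimal sketch; the header path assumes the new ATen/core location introduced by this diff, and `MyRecord` is a hypothetical user type:

```cpp
#include <ATen/core/typeid.h>

namespace caffe2 {

struct MyRecord {
  int x = 0;
};

// CAFFE_KNOWN_TYPE must appear in exactly one .cpp, in the ::caffe2
// namespace; it instantiates TypeMeta::Id<MyRecord>() with a dynamic id.
CAFFE_KNOWN_TYPE(MyRecord);

} // namespace caffe2

int main() {
  // Make<T>() picks ctor/copy/dtor through the _PickCtor/_PickCopy helpers;
  // a default-constructible, copy-assignable type gets all three.
  caffe2::TypeMeta meta = caffe2::TypeMeta::Make<caffe2::MyRecord>();
  bool matches = meta.Match<caffe2::MyRecord>(); // true
  return matches && meta.itemsize() == sizeof(caffe2::MyRecord) ? 0 : 1;
}
```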
diff --git a/caffe2/core/typeid_test.cc b/caffe2/core/typeid_test.cc
index cc62a108de7daf..c2cc42bd803e08 100644
--- a/caffe2/core/typeid_test.cc
+++ b/caffe2/core/typeid_test.cc
@@ -37,7 +37,7 @@ TEST(TypeMetaTest, Names) {
   EXPECT_TRUE(
       string(string_meta.name()) != typeid(string).name());
   EXPECT_TRUE(
-      string(string_meta.name()) == Demangle(typeid(string).name()));
+      string(string_meta.name()) == at::demangle(typeid(string).name()));
 #endif // __GXX_RTTI
 }
diff --git a/caffe2/db/CMakeLists.txt b/caffe2/db/CMakeLists.txt
index e96d6b79165a0b..d05d87d8107982 100644
--- a/caffe2/db/CMakeLists.txt
+++ b/caffe2/db/CMakeLists.txt
@@ -5,10 +5,14 @@ set(Caffe2_DB_COMMON_CPU_SRC
 set(Caffe2_DB_COMMON_GPU_SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/create_db_op_gpu.cc"
 )
+set(Caffe2_DB_COMMON_HIP_SRC
+  "${CMAKE_CURRENT_SOURCE_DIR}/hip/create_db_op_hip.cc"
+)
 
 # Common files that are always going to be included.
 list(APPEND Caffe2_CPU_SRCS ${Caffe2_DB_COMMON_CPU_SRC})
 list(APPEND Caffe2_GPU_SRCS ${Caffe2_DB_COMMON_GPU_SRC})
+list(APPEND Caffe2_HIP_SRCS ${Caffe2_DB_COMMON_HIP_SRC})
 
 # DB specific files
 if (USE_LMDB)
@@ -25,3 +29,4 @@ endif()
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
 set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
+set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index 3829219a933b5d..4e8d2268258416 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -22,7 +22,7 @@ install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2
         FILES_MATCHING PATTERN "*.h")
 if (NOT BUILD_ATEN)
   install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core
-          DESTINATION include/ATen/core
+          DESTINATION include/ATen
           FILES_MATCHING PATTERN "*.h")
 endif()
 install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
diff --git a/setup.py b/setup.py
index 3eb6ba3b6c9621..6a23eb852d7072 100644
--- a/setup.py
+++ b/setup.py
@@ -256,6 +256,7 @@ def patched_link(self, *args, **kwargs):
         version += '+' + sha[:7]
     except Exception:
         pass
+print("Building wheel {}-{}".format(package_name, version))
 
 
 class create_version_file(PytorchCommand):
@@ -776,7 +777,6 @@ def run(self):
     "torch/csrc/autograd/python_variable.cpp",
     "torch/csrc/autograd/python_variable_indexing.cpp",
     "torch/csrc/byte_order.cpp",
-    "torch/csrc/finalizer.cpp",
     "torch/csrc/jit/batched/BatchTensor.cpp",
     "torch/csrc/jit/init.cpp",
     "torch/csrc/jit/ivalue.cpp",
diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp
index ea8b37d44db54b..c822560e5f1ea3 100644
--- a/test/cpp/api/module.cpp
+++ b/test/cpp/api/module.cpp
@@ -289,7 +289,7 @@ TEST_CASE("module/clone") {
     a->module->weight.data() += 1;
     a->module->value = 123;
 
-    auto b = std::static_pointer_cast(a->clone());
+    auto b = std::dynamic_pointer_cast(a->clone());
 
     REQUIRE(!pointer_equal(b->module->weight, a->module->weight));
     REQUIRE(
diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp
index 186159c8e98edf..257bf15d546578 100644
--- a/test/cpp/api/optim.cpp
+++ b/test/cpp/api/optim.cpp
@@ -35,7 +35,8 @@ bool test_optimizer_xor(Options options) {
   const int64_t kBatchSize = 4;
   const int64_t kMaximumNumberOfEpochs = 3000;
 
-  auto optimizer = OptimizerClass(model->parameters(), options);
+  auto optimizer = OptimizerClass(std::vector<torch::Tensor>(), options);
+  optimizer.add_parameters(model->parameters());
 
   float running_loss = 1;
   int epoch = 0;
@@ -258,3 +259,22 @@ TEST_CASE("Optim/ExternalVectorOfParameters") {
   REQUIRE(parameters[1].allclose(original_parameters[1] - 1.0));
   REQUIRE(parameters[2].allclose(original_parameters[2] - 1.0));
 }
+
+TEST_CASE("Optim/AddParameter/LBFGS") {
+  torch::manual_seed(0);
+
+  std::vector<torch::Tensor> parameters = {torch::randn({5, 5})};
+  std::vector<torch::Tensor> original_parameters = {parameters[0].clone()};
+
+  // Set all gradients to one
+  for (auto& parameter : parameters) {
+    parameter.grad() = torch::ones_like(parameter);
+  }
+
+  LBFGS optimizer(std::vector<torch::Tensor>(), 1.0);
+  optimizer.add_parameters(parameters);
+
+  optimizer.step([]() { return torch::tensor(1); });
+
+  // REQUIRE this doesn't throw
+}
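The rewritten XOR test above exercises two-phase construction: an optimizer may now start with zero parameters and have them registered later via `add_parameters`. A hedged sketch of the same pattern outside the test harness, using only API names that appear in these tests:

```cpp
#include <torch/torch.h>

int main() {
  auto model = torch::nn::Linear(4, 2);
  // Construct with an empty parameter list...
  torch::optim::SGD optimizer(std::vector<torch::Tensor>(),
                              torch::optim::SGDOptions(0.1));
  // ...then attach the model's parameters afterwards.
  optimizer.add_parameters(model->parameters());

  optimizer.zero_grad();
  model->forward(torch::randn({8, 4})).sum().backward();
  optimizer.step(); // per-parameter buffers are grown lazily (see buffer_at)
  return 0;
}
```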
diff --git a/test/cpp/api/sequential.cpp b/test/cpp/api/sequential.cpp
index 4d855cb10c9f85..7d07ccccb5887a 100644
--- a/test/cpp/api/sequential.cpp
+++ b/test/cpp/api/sequential.cpp
@@ -278,7 +278,7 @@ TEST_CASE("sequential") {
   SECTION("Is cloneable") {
     Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3));
     Sequential clone =
-        std::static_pointer_cast<SequentialImpl>(sequential->clone());
+        std::dynamic_pointer_cast<SequentialImpl>(sequential->clone());
     REQUIRE(sequential->size() == clone->size());
 
     for (size_t i = 0; i < sequential->size(); ++i) {
@@ -309,7 +309,7 @@ TEST_CASE("sequential/clone-to-device", "[cuda]") {
   Sequential sequential(Linear(3, 4), Functional(torch::relu), BatchNorm(3));
   torch::Device device(torch::kCUDA, 0);
   Sequential clone =
-      std::static_pointer_cast<SequentialImpl>(sequential->clone(device));
+      std::dynamic_pointer_cast<SequentialImpl>(sequential->clone(device));
   for (const auto& p : clone->parameters()) {
     REQUIRE(p->device() == device);
   }
diff --git a/test/cpp/api/serialization.cpp b/test/cpp/api/serialization.cpp
index 5cc8cc9e7d27b7..7f8bbe27419231 100644
--- a/test/cpp/api/serialization.cpp
+++ b/test/cpp/api/serialization.cpp
@@ -228,6 +228,14 @@ TEST_CASE("serialization") {
   ss.seekg(0, std::ios::beg);
   torch::load(ss, model3.get());
 
+  auto param1 = model1->parameters();
+  auto param2 = model2->parameters();
+  auto param3 = model3->parameters();
+  for (const auto& p : param1) {
+    REQUIRE(param1[p.key].allclose(param2[p.key]));
+    REQUIRE(param2[p.key].allclose(param3[p.key]));
+  }
+
   // Make some optimizers with momentum (and thus state)
   auto optim1 = torch::optim::SGD(
       model1->parameters(), torch::optim::SGDOptions(1e-1).momentum(0.9));
@@ -240,9 +248,9 @@ TEST_CASE("serialization") {
   auto optim3_2 = torch::optim::SGD(
       model3->parameters(), torch::optim::SGDOptions(1e-1).momentum(0.9));
 
-  auto x = torch::ones({10, 5}, torch::requires_grad());
+  auto x = torch::ones({10, 5});
 
-  auto step = [&](torch::optim::Optimizer& optimizer, Linear model) {
+  auto step = [&x](torch::optim::Optimizer& optimizer, Linear model) {
     optimizer.zero_grad();
     auto y = model->forward(x).sum();
     y.backward();
@@ -264,11 +272,11 @@ TEST_CASE("serialization") {
   torch::load(ss, &optim3_2);
   step(optim3_2, model3);
 
-  auto param1 = model1->parameters();
-  auto param2 = model2->parameters();
-  auto param3 = model3->parameters();
-  for (auto& p : param1) {
-    auto& name = p.key;
+  param1 = model1->parameters();
+  param2 = model2->parameters();
+  param3 = model3->parameters();
+  for (const auto& p : param1) {
+    const auto& name = p.key;
     // Model 1 and 3 should be the same
     REQUIRE(param1[name].norm().toCFloat() == param3[name].norm().toCFloat());
     REQUIRE(param1[name].norm().toCFloat() != param2[name].norm().toCFloat());
diff --git a/tools/amd_build/build_caffe2_amd.py b/tools/amd_build/build_caffe2_amd.py
index 9726bc2ebed542..10f72d999cd0d3 100755
--- a/tools/amd_build/build_caffe2_amd.py
+++ b/tools/amd_build/build_caffe2_amd.py
@@ -17,6 +17,7 @@
     "caffe2/queue/*",
    "caffe2/**/*_test*",
     "caffe2/core/THCCachingAllocator*",
+    "caffe2/db/*",
 ]
 
 ignores = [
diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp
index 4713581728ebb5..75a59063842911 100644
--- a/tools/autograd/templates/VariableType.cpp
+++ b/tools/autograd/templates/VariableType.cpp
@@ -43,7 +43,7 @@ using namespace torch::autograd::generated;
 namespace torch { namespace autograd {
 
 VariableType::VariableType(Context* context, Type* baseType)
-  : Type(context, /*is_variable=*/true, /*is_undefined=*/false)
+  : Type(context, baseType->type_id(), /*is_variable=*/true, /*is_undefined=*/false)
   , baseType(baseType)
   , id_(context->freshTypeID()) {
   str = std::string("Variable[") + baseType->toString() + "]";
diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp
index 9838a282d90b35..4aae5f1c2fd494 100644
--- a/torch/csrc/Storage.cpp
+++ b/torch/csrc/Storage.cpp
@@ -13,7 +13,6 @@
 // See Note [TH abstraction violation]
 //    - Used to get at the allocator associated with a storage
 #include
-#include
 #include "THP.h"
 #include "copy_utils.h"
diff --git a/torch/csrc/api/include/torch/nn/cloneable.h b/torch/csrc/api/include/torch/nn/cloneable.h
index 759a3341511205..feb4baebaece16 100644
--- a/torch/csrc/api/include/torch/nn/cloneable.h
+++ b/torch/csrc/api/include/torch/nn/cloneable.h
@@ -21,7 +21,7 @@ namespace nn {
 /// `clone()` method. We do not want to use this pattern in the base class,
 /// because then storing a module would always require templatizing it.
 template <typename Derived>
-class Cloneable : public Module {
+class Cloneable : public virtual Module {
  public:
  using Module::Module;
diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h
index 0254f3dce5fc4e..e8140659579afd 100644
--- a/torch/csrc/api/include/torch/nn/module.h
+++ b/torch/csrc/api/include/torch/nn/module.h
@@ -205,7 +205,7 @@ std::shared_ptr<ModuleType> Module::register_module(
     std::string name,
     std::shared_ptr<ModuleType> module) {
   auto& base_module = children_.insert(std::move(name), std::move(module));
-  return std::static_pointer_cast<ModuleType>(base_module);
+  return std::dynamic_pointer_cast<ModuleType>(base_module);
 }
 
 template <typename ModuleType>
diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h
index 864f7245fcb69c..f95e81d636afe2 100644
--- a/torch/csrc/api/include/torch/nn/modules/any.h
+++ b/torch/csrc/api/include/torch/nn/modules/any.h
@@ -315,7 +315,7 @@ struct AnyModule::Holder : public AnyModule::Placeholder {
   std::unique_ptr<Placeholder> clone(
       at::optional<Device> device) const override {
     return torch::make_unique<Holder>(
-        std::static_pointer_cast<ModuleType>(module->clone(device)));
+        std::dynamic_pointer_cast<ModuleType>(module->clone(device)));
   }
 
   /// The actual concrete module instance.
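The switch from `static_pointer_cast` to `dynamic_pointer_cast` in these files is forced by `Cloneable` now inheriting *virtually* from `Module`: a static downcast from a virtual base does not compile. A self-contained illustration (toy types, not the real class hierarchy):

```cpp
#include <memory>

struct Module {
  virtual ~Module() = default; // polymorphic, so dynamic_cast is available
};

template <typename Derived>
struct Cloneable : public virtual Module {};

struct Linear : Cloneable<Linear> {};

int main() {
  std::shared_ptr<Module> base = std::make_shared<Linear>();
  // std::static_pointer_cast<Linear>(base); // ill-formed: virtual base
  auto linear = std::dynamic_pointer_cast<Linear>(base); // OK at runtime
  return linear != nullptr ? 0 : 1;
}
```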
diff --git a/torch/csrc/api/include/torch/optim/adagrad.h b/torch/csrc/api/include/torch/optim/adagrad.h
index 0e2b2be251b497..6e64f309f18b3d 100644
--- a/torch/csrc/api/include/torch/optim/adagrad.h
+++ b/torch/csrc/api/include/torch/optim/adagrad.h
@@ -29,13 +29,13 @@ class Adagrad : public Optimizer {
       ParameterContainer&& parameters,
       const AdagradOptions& options)
       : Optimizer(std::forward<ParameterContainer>(parameters)),
-        options_(options),
+        options(options),
         sum_(zero_buffers_like(parameters_)),
         step_(parameters_.size(), 0) {}
 
   void step() override;
 
-  const AdagradOptions& options() const noexcept;
+  AdagradOptions options;
 
   template <class Archive>
   void serialize(Archive& ar) {
@@ -45,12 +45,10 @@ class Adagrad : public Optimizer {
  private:
   friend class cereal::access;
-  Adagrad() : options_(0) {}
-
-  AdagradOptions options_;
+  Adagrad() : options(0) {}
 
   std::vector<Tensor> sum_;
-  std::vector step_;
+  std::vector step_;
 };
 } // namespace optim
 } // namespace torch
diff --git a/torch/csrc/api/include/torch/optim/adam.h b/torch/csrc/api/include/torch/optim/adam.h
index 5bf3ef04f0e110..7ad3a5190ce993 100644
--- a/torch/csrc/api/include/torch/optim/adam.h
+++ b/torch/csrc/api/include/torch/optim/adam.h
@@ -30,11 +30,11 @@ class Adam : public Optimizer {
   template <typename ParameterContainer>
   explicit Adam(ParameterContainer&& parameters, const AdamOptions& options)
       : Optimizer(std::forward<ParameterContainer>(parameters)),
-        options_(options),
+        options(options),
         step_buffers_(parameters_.size(), 0),
         exp_average_buffers_(zero_buffers_like(parameters_)),
         exp_average_sq_buffers_(zero_buffers_like(parameters_)) {
-    if (options_.amsgrad_) {
+    if (options.amsgrad_) {
       max_exp_average_sq_buffers_ = zero_buffers_like(parameters_);
     }
   }
@@ -49,13 +49,11 @@ class Adam : public Optimizer {
         CEREAL_NVP(max_exp_average_sq_buffers_));
   }
 
-  const AdamOptions& options() const noexcept;
+  AdamOptions options;
 
  private:
   friend class cereal::access;
-  Adam() : options_(0) {}
-
-  AdamOptions options_;
+  Adam() : options(0) {}
 
   std::vector<int64_t> step_buffers_;
   std::vector<Tensor> exp_average_buffers_;
diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h
index fe969c84677e73..d812362ccfbe86 100644
--- a/torch/csrc/api/include/torch/optim/lbfgs.h
+++ b/torch/csrc/api/include/torch/optim/lbfgs.h
@@ -31,13 +31,13 @@ class LBFGS : public LossClosureOptimizer {
   template <typename ParameterContainer>
   explicit LBFGS(ParameterContainer&& parameters, const LBFGSOptions& options)
       : LossClosureOptimizer(std::forward<ParameterContainer>(parameters)),
-        options_(options),
-        ro(options_.history_size_),
-        al(options_.history_size_) {}
+        options(options),
+        ro(options.history_size_),
+        al(options.history_size_) {}
 
   torch::Tensor step(LossClosure closure) override;
 
-  const LBFGSOptions& options() const noexcept;
+  LBFGSOptions options;
 
   template <class Archive>
   void serialize(Archive& ar) {
@@ -52,13 +52,11 @@ class LBFGS : public LossClosureOptimizer {
  private:
   friend class cereal::access;
-  LBFGS() : options_(0) {}
+  LBFGS() : options(0) {}
 
   at::Tensor gather_flat_grad();
   void add_grad(const torch::Scalar& step_size, const at::Tensor& update);
 
-  LBFGSOptions options_;
-
   at::Tensor d{torch::empty({0})};
   at::Tensor H_diag{torch::empty({0})};
   at::Tensor prev_flat_grad{torch::empty({0})};
diff --git a/torch/csrc/api/include/torch/optim/optimizer.h b/torch/csrc/api/include/torch/optim/optimizer.h
index eed600ab759bc4..4f56c1f67236e2 100644
--- a/torch/csrc/api/include/torch/optim/optimizer.h
+++ b/torch/csrc/api/include/torch/optim/optimizer.h
@@ -4,6 +4,7 @@
 #include
 #include
+#include <algorithm>
 #include
 #include
 #include
@@ -64,9 +65,28 @@ class OptimizerBase {
     return result;
   }
 
+  /// Accesses a buffer at the given index.
+  /// Additionally, grows `buffers` with zero-initialized entries up to
+  /// `index` if it is out of range.
+  template <typename T>
+  T& buffer_at(std::vector<T>& buffers, size_t index) {
+    if (buffers.size() <= index) {
+      const auto old_size = buffers.size();
+      buffers.resize(index + 1);
+      std::fill(buffers.begin() + old_size, buffers.end(), T{0});
+    }
+    return buffers[index];
+  }
+
   /// Accesses a buffer at the given index, converts it to the type of the
   /// parameter at the corresponding index (a no-op if they match).
+  /// Additionally, grows `buffers` with zero-initialized entries up to
+  /// `index` if it is out of range.
   Tensor& buffer_at(std::vector<Tensor>& buffers, size_t index) {
+    if (buffers.size() <= index) {
+      for (auto i = buffers.size(); i <= index; i++) {
+        buffers.push_back(torch::zeros_like(parameters_.at(i)));
+      }
+    }
+    // Copy the buffer to the device and dtype of the parameter.
     const auto& parameter = parameters_.at(index);
     const auto& buffer = buffers.at(index);
     if (buffer.device() != parameter.device() ||
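Both `buffer_at` overloads above implement the same lazy-growth contract: indexing past the end extends the buffer vector with zeroed entries instead of throwing, which is what makes `add_parameters` after construction workable. A standalone restatement of the arithmetic-buffer policy:

```cpp
#include <cstdint>
#include <vector>

// Grow-on-demand indexing: out-of-range slots are value-initialized to T{0}.
template <typename T>
T& buffer_at(std::vector<T>& buffers, std::size_t index) {
  if (buffers.size() <= index) {
    buffers.resize(index + 1, T{0});
  }
  return buffers[index];
}

int main() {
  std::vector<int64_t> steps;
  buffer_at(steps, 3) += 1; // steps becomes {0, 0, 0, 1}
  return (steps.size() == 4 && steps[3] == 1) ? 0 : 1;
}
```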
diff --git a/torch/csrc/api/include/torch/optim/rmsprop.h b/torch/csrc/api/include/torch/optim/rmsprop.h
index e51cacc586db37..5077536a97a78f 100644
--- a/torch/csrc/api/include/torch/optim/rmsprop.h
+++ b/torch/csrc/api/include/torch/optim/rmsprop.h
@@ -33,7 +33,7 @@ class RMSprop : public Optimizer {
       ParameterContainer&& parameters,
       const RMSpropOptions& options)
       : Optimizer(std::forward<ParameterContainer>(parameters)),
-        options_(options),
+        options(options),
         square_average_buffers_(zero_buffers_like(parameters_)) {
     if (options.centered_ > 0) {
       grad_average_buffers_ = zero_buffers_like(parameters_);
@@ -45,7 +45,7 @@ class RMSprop : public Optimizer {
 
   void step() override;
 
-  const RMSpropOptions& options() const noexcept;
+  RMSpropOptions options;
 
   template <class Archive>
   void serialize(Archive& ar) {
@@ -56,9 +56,7 @@ class RMSprop : public Optimizer {
  private:
   friend class cereal::access;
-  RMSprop() : options_(0) {}
-
-  RMSpropOptions options_;
+  RMSprop() : options(0) {}
 
   std::vector<Tensor> square_average_buffers_;
   std::vector<Tensor> momentum_buffers_;
diff --git a/torch/csrc/api/include/torch/optim/sgd.h b/torch/csrc/api/include/torch/optim/sgd.h
index 47196074d380ee..9f58e4a7232915 100644
--- a/torch/csrc/api/include/torch/optim/sgd.h
+++ b/torch/csrc/api/include/torch/optim/sgd.h
@@ -30,8 +30,8 @@ class SGD : public Optimizer {
   template <typename ParameterContainer>
   explicit SGD(ParameterContainer&& parameters, const SGDOptions& options)
       : Optimizer(std::forward<ParameterContainer>(parameters)),
-        options_(options) {
-    if (options_.momentum_ > 0) {
+        options(options) {
+    if (options.momentum_ > 0) {
       momentum_buffers_ = zero_buffers_like(parameters_);
     }
   }
@@ -43,13 +43,12 @@ class SGD : public Optimizer {
     ar(CEREAL_NVP(momentum_buffers_));
   }
 
-  const SGDOptions& options() const noexcept;
+  SGDOptions options;
 
  private:
   friend class cereal::access;
-  SGD() : options_(0) {}
+  SGD() : options(0) {}
 
-  SGDOptions options_;
   std::vector<Tensor> momentum_buffers_;
 
   /// Counts how often `step()` is called, for dampening.
   size_t iteration_{0};
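With `options` promoted from a private member behind a const accessor to a plain public field, hyperparameters can be adjusted between steps. A hedged usage sketch (field and builder names, such as `learning_rate_` and `momentum`, are exactly as they appear in the headers above):

```cpp
#include <torch/torch.h>

int main() {
  auto model = torch::nn::Linear(3, 1);
  torch::optim::SGD optimizer(model->parameters(),
                              torch::optim::SGDOptions(0.1).momentum(0.9));
  for (int epoch = 0; epoch < 3; ++epoch) {
    optimizer.zero_grad();
    model->forward(torch::ones({2, 3})).sum().backward();
    optimizer.step();
    optimizer.options.learning_rate_ *= 0.5; // manual decay, now legal
  }
  return 0;
}
```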
diff --git a/torch/csrc/api/include/torch/serialization.h b/torch/csrc/api/include/torch/serialization.h
index 61b5b53b59b331..d28930a3ddb31e 100644
--- a/torch/csrc/api/include/torch/serialization.h
+++ b/torch/csrc/api/include/torch/serialization.h
@@ -4,6 +4,7 @@
 #include
 #include
+#include
 #include "cereal/archives/binary.hpp"
 #include "cereal/types/polymorphic.hpp"
@@ -168,12 +169,13 @@ loadBinary(BinaryInputArchive& archive, void* data, size_t size) {
 // Gradients will not be saved for variables
 template <class Archive>
 void save(Archive& archive, torch::Tensor const& tensor) {
+  torch::NoGradGuard guard;
   if (!tensor.defined()) {
     int32_t typeId = ::torch::detail::scalarTypeId(torch::Dtype::Undefined);
     archive(CEREAL_NVP(typeId));
     return;
   } else {
-    int32_t typeId = ::torch::detail::scalarTypeId(tensor.data().type().scalarType());
+    int32_t typeId = ::torch::detail::scalarTypeId(tensor.dtype());
     archive(CEREAL_NVP(typeId));
   }
   auto sizes = std::vector<int64_t>();
@@ -199,6 +201,7 @@ void save(Archive& archive, torch::Tensor const& tensor) {
  **/
 template <class Archive>
 void load(Archive& archive, torch::Tensor& tensor) {
+  torch::NoGradGuard guard;
   torch::Dtype type;
   int32_t typeId;
   archive(CEREAL_NVP(typeId));
@@ -214,19 +217,19 @@ void load(Archive& archive, torch::Tensor& tensor) {
   archive(CEREAL_NVP(backendId), CEREAL_NVP(sizes));
 
   at::Backend backend = ::torch::detail::backendFromId(backendId);
-  if (!tensor.defined() || tensor.data().type().scalarType() != type) {
+  if (!tensor.defined() || tensor.dtype() != type) {
     tensor = torch::empty({}, torch::getType(backend, type));
   }
   tensor.data().resize_(sizes);
 
   if (tensor.type().is_cuda()) {
     // should actually use cudamemcpy probably
-    auto cputensor = torch::empty(sizes, tensor.data().type().scalarType());
+    auto cputensor = torch::empty(sizes, tensor.dtype());
     agimpl::loadBinary(
         archive,
         cputensor.data_ptr(),
         cputensor.numel() * cputensor.type().elementSizeInBytes());
-    tensor.copy_(cputensor);
+    tensor.data().copy_(cputensor.data());
   } else {
     agimpl::loadBinary(
         archive,
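The `NoGradGuard` added to both paths above keeps the in-place `resize_`/`copy_` calls used during (de)serialization out of the autograd graph. The same guard pattern in user code, as a minimal sketch (the helper name is hypothetical):

```cpp
#include <torch/torch.h>

// Overwrite `dst` from `src` without recording the mutation for autograd.
void overwrite_inplace(torch::Tensor& dst, const torch::Tensor& src) {
  torch::NoGradGuard guard; // everything below is invisible to autograd
  dst.data().resize_(src.sizes());
  dst.data().copy_(src.data());
}

int main() {
  auto a = torch::zeros({2, 2}, torch::requires_grad());
  auto b = torch::ones({3, 3});
  overwrite_inplace(a, b); // a now holds b's data; no grad history recorded
  return 0;
}
```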
diff --git a/torch/csrc/api/src/optim/adagrad.cpp b/torch/csrc/api/src/optim/adagrad.cpp
index 7d87e0c3a03914..d0fa9afd79966d 100644
--- a/torch/csrc/api/src/optim/adagrad.cpp
+++ b/torch/csrc/api/src/optim/adagrad.cpp
@@ -12,10 +12,6 @@ namespace optim {
 AdagradOptions::AdagradOptions(double learning_rate)
     : learning_rate_(learning_rate) {}
 
-const AdagradOptions& Adagrad::options() const noexcept {
-  return options_;
-}
-
 /// Adapted from
 /// https://github.com/pytorch/pytorch/blob/master/torch/optim/adagrad.py
 void Adagrad::step() {
@@ -26,16 +22,16 @@ void Adagrad::step() {
       continue;
 
     auto d_p = Tensor(grad).data();
-    if (options_.weight_decay_ > 0) {
-      d_p.add_(p, options_.weight_decay_);
+    if (options.weight_decay_ > 0) {
+      d_p.add_(p, options.weight_decay_);
     }
-    step_.at(i) += 1.0;
-    auto clr = options_.learning_rate_ /
-        (1.0 + (step_.at(i) - 1.0) * options_.lr_decay_);
+    buffer_at(step_, i) += 1.0;
+    auto clr = options.learning_rate_ /
+        (1.0 + (buffer_at(step_, i) - 1.0) * options.lr_decay_);
     auto sum = buffer_at(sum_, i);
     sum.data().addcmul_(d_p, d_p, 1.0);
-    auto std = sum_.at(i).data().sqrt().add_(1e-10);
+    auto std = buffer_at(sum_, i).data().sqrt().add_(1e-10);
     p.addcdiv_(d_p, std, -clr);
   }
 }
diff --git a/torch/csrc/api/src/optim/adam.cpp b/torch/csrc/api/src/optim/adam.cpp
index e05b81ef7dd8eb..48a0c6a8562e9a 100644
--- a/torch/csrc/api/src/optim/adam.cpp
+++ b/torch/csrc/api/src/optim/adam.cpp
@@ -14,10 +14,6 @@ namespace optim {
 AdamOptions::AdamOptions(double learning_rate)
     : learning_rate_(learning_rate) {}
 
-const AdamOptions& Adam::options() const noexcept {
-  return options_;
-}
-
 void Adam::step() {
   for (size_t i = 0; i < parameters_.size(); ++i) {
     auto& grad = parameters_.at(i).grad();
@@ -29,32 +25,32 @@ void Adam::step() {
     auto exp_average = buffer_at(exp_average_buffers_, i).data();
     auto exp_average_sq = buffer_at(exp_average_sq_buffers_, i).data();
 
-    step_buffers_.at(i) += 1;
+    buffer_at(step_buffers_, i) += 1;
 
     auto d_p = torch::autograd::as_variable_ref(grad).data();
-    if (options_.weight_decay_ > 0) {
-      d_p.add_(p, options_.weight_decay_);
+    if (options.weight_decay_ > 0) {
+      d_p.add_(p, options.weight_decay_);
     }
 
-    exp_average.mul_(options_.beta1_).add_(d_p, 1 - options_.beta1_);
-    exp_average_sq.mul_(options_.beta2_)
-        .addcmul_(d_p, d_p, 1 - options_.beta2_);
+    exp_average.mul_(options.beta1_).add_(d_p, 1 - options.beta1_);
+    exp_average_sq.mul_(options.beta2_)
+        .addcmul_(d_p, d_p, 1 - options.beta2_);
 
     at::Tensor denom;
-    if (options_.amsgrad_) {
+    if (options.amsgrad_) {
       auto max_exp_average_sq = buffer_at(max_exp_average_sq_buffers_, i).data();
       torch::max_out(max_exp_average_sq, max_exp_average_sq, exp_average_sq);
-      denom = max_exp_average_sq.sqrt().add_(options_.eps_);
+      denom = max_exp_average_sq.sqrt().add_(options.eps_);
     } else {
-      denom = exp_average_sq.sqrt().add_(options_.eps_);
+      denom = exp_average_sq.sqrt().add_(options.eps_);
     }
 
     const auto bias_correction1 =
-        1 - std::pow(options_.beta1_, step_buffers_.at(i));
+        1 - std::pow(options.beta1_, buffer_at(step_buffers_, i));
     const auto bias_correction2 =
-        1 - std::pow(options_.beta2_, step_buffers_.at(i));
-    const auto step_size = options_.learning_rate_ *
+        1 - std::pow(options.beta2_, buffer_at(step_buffers_, i));
+    const auto step_size = options.learning_rate_ *
         std::sqrt(bias_correction2) / bias_correction1;
 
     p.addcdiv_(exp_average, denom, -step_size);
diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp
index 8048abc51ed793..3aaf8a63ae2d62 100644
--- a/torch/csrc/api/src/optim/lbfgs.cpp
+++ b/torch/csrc/api/src/optim/lbfgs.cpp
@@ -15,10 +15,6 @@ namespace optim {
 LBFGSOptions::LBFGSOptions(double learning_rate)
     : learning_rate_(learning_rate) {}
 
-const LBFGSOptions& LBFGS::options() const noexcept {
-  return options_;
-}
-
 at::Tensor LBFGS::gather_flat_grad() {
   std::vector<at::Tensor> views;
   for (auto& parameter : parameters_) {
@@ -46,14 +42,14 @@ torch::Tensor LBFGS::step(LossClosure closure) {
   at::Tensor flat_grad = gather_flat_grad();
   torch::Scalar abs_grad_sum = torch::Scalar(flat_grad.abs().sum());
 
-  if (torch::Scalar(abs_grad_sum).toFloat() <= options_.tolerance_grad_) {
+  if (torch::Scalar(abs_grad_sum).toFloat() <= options.tolerance_grad_) {
     return loss;
   }
 
   at::Tensor ONE = flat_grad.type().scalarTensor(1);
 
   int64_t n_iter = 0;
-  while (n_iter < options_.max_iter_) {
+  while (n_iter < options.max_iter_) {
     n_iter++;
     state_n_iter++;
@@ -69,7 +65,7 @@ torch::Tensor LBFGS::step(LossClosure closure) {
 
     if (ys.toFloat() > 1e-10) {
       // updating memory
-      if (old_dirs.size() == options_.history_size_) {
+      if (old_dirs.size() == options.history_size_) {
         // shift history by one (limited memory)
         old_dirs.pop_front();
         old_stps.pop_front();
@@ -114,15 +110,15 @@ torch::Tensor LBFGS::step(LossClosure closure) {
     // reset initial guess for step size
     if (n_iter == 1) {
       t = torch::Scalar(
-          at::min(ONE, ONE / abs_grad_sum) * options_.learning_rate_);
+          at::min(ONE, ONE / abs_grad_sum) * options.learning_rate_);
     } else {
-      t = options_.learning_rate_;
+      t = options.learning_rate_;
     }
 
     torch::Scalar gtd = torch::Scalar(flat_grad.dot(d));
     add_grad(t, d);
     int64_t ls_func_evals = 0;
-    if (n_iter != options_.max_iter_) {
+    if (n_iter != options.max_iter_) {
       // re-evaluate function only if not in last iteration
       // the reason we do this: in a stochastic setting,
       // no use to re-evaluate that function here
@@ -138,21 +134,21 @@ torch::Tensor LBFGS::step(LossClosure closure) {
      * Check conditions
      */
 
-    if (n_iter == options_.max_iter_) {
+    if (n_iter == options.max_iter_) {
       break;
-    } else if (current_evals >= options_.max_eval_) {
+    } else if (current_evals >= options.max_eval_) {
       break;
-    } else if (abs_grad_sum.toFloat() <= options_.tolerance_grad_) {
+    } else if (abs_grad_sum.toFloat() <= options.tolerance_grad_) {
       break;
-    } else if (gtd.toFloat() > -options_.tolerance_grad_) {
+    } else if (gtd.toFloat() > -options.tolerance_grad_) {
       break;
     } else if (
         torch::Scalar(d.mul(t).abs_().sum()).toFloat() <=
-        options_.tolerance_change_) {
+        options.tolerance_change_) {
       break;
     } else if (
         std::abs(loss.toCFloat() - prev_loss.toFloat()) <
-        options_.tolerance_change_) {
+        options.tolerance_change_) {
       break;
     }
   }
diff --git a/torch/csrc/api/src/optim/rmsprop.cpp b/torch/csrc/api/src/optim/rmsprop.cpp
index abf2bf9a1f932b..e4d1eaa95418ac 100644
--- a/torch/csrc/api/src/optim/rmsprop.cpp
+++ b/torch/csrc/api/src/optim/rmsprop.cpp
@@ -12,10 +12,6 @@ namespace optim {
 RMSpropOptions::RMSpropOptions(double learning_rate)
     : learning_rate_(learning_rate) {}
 
-const RMSpropOptions& RMSprop::options() const noexcept {
-  return options_;
-}
-
 /// Adapted from
 /// https://github.com/pytorch/pytorch/blob/master/torch/optim/rmsprop.py
 void RMSprop::step() {
@@ -27,31 +23,31 @@ void RMSprop::step() {
     }
 
     auto d_p = torch::autograd::as_variable_ref(grad).data();
-    if (options_.weight_decay_ > 0) {
-      d_p.add_(p, options_.weight_decay_);
+    if (options.weight_decay_ > 0) {
+      d_p.add_(p, options.weight_decay_);
     }
 
     auto square_average = buffer_at(square_average_buffers_, i).data();
-    square_average.mul_(options_.alpha_)
-        .addcmul_(d_p, d_p, 1.0 - options_.alpha_);
+    square_average.mul_(options.alpha_)
+        .addcmul_(d_p, d_p, 1.0 - options.alpha_);
 
     at::Tensor average;
-    if (options_.centered_ > 0) {
+    if (options.centered_ > 0) {
       auto grad_average = buffer_at(grad_average_buffers_, i).data();
-      grad_average.mul_(options_.alpha_).add_(d_p, 1.0 - options_.alpha_);
+      grad_average.mul_(options.alpha_).add_(d_p, 1.0 - options.alpha_);
       average = square_average.addcmul(grad_average, grad_average, -1.0)
                     .sqrt()
-                    .add_(options_.eps_);
+                    .add_(options.eps_);
     } else {
-      average = square_average.sqrt().add_(options_.eps_);
+      average = square_average.sqrt().add_(options.eps_);
     }
 
-    if (options_.momentum_ > 0) {
+    if (options.momentum_ > 0) {
       auto momentum = buffer_at(momentum_buffers_, i).data();
-      momentum.mul_(options_.momentum_).addcdiv_(d_p, average);
-      p.add_(momentum, -options_.learning_rate_);
+      momentum.mul_(options.momentum_).addcdiv_(d_p, average);
+      p.add_(momentum, -options.learning_rate_);
     } else {
-      p.addcdiv_(d_p, average, -options_.learning_rate_);
+      p.addcdiv_(d_p, average, -options.learning_rate_);
     }
   }
 }
diff --git a/torch/csrc/api/src/optim/sgd.cpp b/torch/csrc/api/src/optim/sgd.cpp
index 4b8ee7ae12822a..9948c12c1c3d3c 100644
--- a/torch/csrc/api/src/optim/sgd.cpp
+++ b/torch/csrc/api/src/optim/sgd.cpp
@@ -10,10 +10,6 @@ namespace torch {
 namespace optim {
 SGDOptions::SGDOptions(double learning_rate) : learning_rate_(learning_rate) {}
 
-const SGDOptions& SGD::options() const noexcept {
-  return options_;
-}
-
 void SGD::step() {
   for (size_t i = 0; i < parameters_.size(); ++i) {
     auto& grad = parameters_.at(i).grad();
@@ -24,25 +20,25 @@ void SGD::step() {
     }
 
     auto d_p = torch::Tensor(grad).data();
-    if (options_.weight_decay_ > 0) {
-      d_p.add_(p, options_.weight_decay_);
+    if (options.weight_decay_ > 0) {
+      d_p.add_(p, options.weight_decay_);
     }
 
-    if (options_.momentum_ != 0) {
+    if (options.momentum_ != 0) {
       auto momentum = buffer_at(momentum_buffers_, i).data();
       if (iteration_ == 0) {
-        momentum.mul_(options_.momentum_).add_(d_p);
+        momentum.mul_(options.momentum_).add_(d_p);
       } else {
-        momentum.mul_(options_.momentum_).add_(d_p, 1 - options_.dampening_);
+        momentum.mul_(options.momentum_).add_(d_p, 1 - options.dampening_);
       }
 
-      if (options_.nesterov_) {
-        d_p = d_p.add(momentum, options_.momentum_);
+      if (options.nesterov_) {
+        d_p = d_p.add(momentum, options.momentum_);
       } else {
         d_p = momentum;
       }
     }
 
-    p.add_(d_p, -options_.learning_rate_);
+    p.add_(d_p, -options.learning_rate_);
   }
   iteration_ += 1;
 }
diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp
index f8c88c7ddcdde5..9ebcdf6df993d3 100644
--- a/torch/csrc/autograd/variable.cpp
+++ b/torch/csrc/autograd/variable.cpp
@@ -22,7 +22,7 @@ namespace torch {
 namespace autograd {
 Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge)
-    : TensorImpl(data.type().backend(), data.type().scalarType(), nullptr, /* is variable */ true),
+    : TensorImpl(data.type().type_id(), data.type().scalarType(), nullptr, /* is variable */ true),
       data_(std::move(data)),
       grad_fn_(std::move(gradient_edge.function)),
       requires_grad_(false),
@@ -93,8 +93,7 @@ Tensor Variable::Impl::detach() const {
 
 void Variable::Impl::detach_() {
   if (is_view_) {
-    throw std::runtime_error(
-        "Can't detach views in-place. Use detach() instead");
+    AT_ERROR("Can't detach views in-place. Use detach() instead");
   }
   set_requires_grad(false);
   grad_fn_.reset();
@@ -131,7 +130,7 @@ void Variable::Impl::set_data(Tensor new_data) {
 
   // Updates metadata
   scalar_type_ = new_data.type().scalarType();
-  backend_ = new_data.type().backend();
+  type_id_ = new_data.type().type_id();
   is_variable_ = true;
   data_ = std::move(new_data);
 }
diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h
index 5fbfcf5f2bb8c0..32c33a87e2c503 100644
--- a/torch/csrc/autograd/variable.h
+++ b/torch/csrc/autograd/variable.h
@@ -298,7 +298,7 @@ struct Variable::Impl : public at::TensorImpl {
   }
 
   /// Accesses the gradient `Variable` of this `Variable`.
-  Tensor& grad() override {
+  Variable& grad() override {
     return grad_;
   }
   const Variable& grad() const override {
diff --git a/torch/csrc/cuda/Storage.cpp b/torch/csrc/cuda/Storage.cpp
index f767e94ccc38f4..da6f15c10f1d14 100644
--- a/torch/csrc/cuda/Storage.cpp
+++ b/torch/csrc/cuda/Storage.cpp
@@ -11,7 +11,6 @@
 #include "THCP.h"
 
 #include "override_macros.h"
-#include "torch/csrc/finalizer.h"
 #include "torch/csrc/copy_utils.h"
 #include "DynamicTypes.h"
diff --git a/torch/csrc/finalizer.cpp b/torch/csrc/finalizer.cpp
deleted file mode 100644
index b18f4f7bb095d2..00000000000000
--- a/torch/csrc/finalizer.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include
-#include
-
-namespace torch {
-
-} // namespace torch
diff --git a/torch/csrc/finalizer.h b/torch/csrc/finalizer.h
deleted file mode 100644
index 13b9fa5e7bdd04..00000000000000
--- a/torch/csrc/finalizer.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-
-namespace torch {
-
-struct PyObjectFinalizer : public THFinalizer {
-  THPObjectPtr pyobj_;
-  // TODO: This recursive structure can lead to a stack overflow if you
-  // put too many finalizers on the same object
-  std::unique_ptr<THFinalizer> next_;
-  PyObjectFinalizer(PyObject* pyobj) {
-    Py_XINCREF(pyobj);
-    pyobj_ = pyobj;
-  }
-  void operator()() override {
-    if (next_) { (*next_)(); }
-  }
-  ~PyObjectFinalizer() {
-    // We must manually ensure that we have the GIL before
-    // pyobj gets destroyed...
-    AutoGIL gil;
-    pyobj_ = nullptr;
-  }
-};
-
-} // namespace torch
diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp
index dbbdedb03e17ba..c6e949a2085739 100644
--- a/torch/csrc/generic/StorageSharing.cpp
+++ b/torch/csrc/generic/StorageSharing.cpp
@@ -291,45 +291,23 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args)
 // pointer.
 //
 // NB: This does NOT preserve object identity when you call it multiple times
-static PyObject * THPStorage_(weakRef)(THPStorage *self, PyObject *weak_ref_class) {
+static PyObject * THPStorage_(weakRef)(THPStorage *self, PyObject *args) {
   HANDLE_TH_ERRORS
   THStorage* storage = self->cdata;
-  THStorage_weakRetain(storage);
-
-  THPObjectPtr args(Py_BuildValue("(N)", PyLong_FromVoidPtr(storage)));
-  if (!args) return NULL;
-  THPObjectPtr ref(PyObject_Call(weak_ref_class, args, NULL));
-  if (!ref) return NULL;
-
-  // We need to also add a finalizer with an owning reference to the weak class,
-  // so that we can keep the "weak" object live until it should actually be
-  // cleared from the map.
-  // Access to storage->finalizer protected by GIL
-  torch::PyObjectFinalizer* finalizer = new torch::PyObjectFinalizer(ref.get());
-  std::swap(storage->finalizer_, finalizer->next_);
-  storage->finalizer_.reset(finalizer);
-
-  return ref.release();
+  return PyLong_FromVoidPtr(storage);
   END_HANDLE_TH_ERRORS
 }
 
 PyObject * THPStorage_(newWithWeakPtr)(PyObject *_unused, PyObject *arg)
 {
   HANDLE_TH_ERRORS
-  THPObjectPtr ref(PyObject_GetAttrString(arg, "cdata"));
-  if (!ref) {
-    return NULL;
-  } else if (ref.get() == Py_None) {
-    Py_RETURN_NONE;
-  }
-  THPUtils_assert(THPUtils_checkLong(ref.get()),
-      "_new_with_weak_ptr(): arg.cdata must be an 'int'");
-  THStorage *weak_storage = (THStorage*)PyLong_AsVoidPtr(ref.get());
+  THPUtils_assert(THPUtils_checkLong(arg),
+      "_new_with_weak_ptr(): arg must be an 'int'");
+  THStorage *weak_storage = (THStorage*)PyLong_AsVoidPtr(arg);
   if (auto* storage = THStorage_weakLock(weak_storage)) {
     return THPStorage_(New)(storage);
   }
-
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
 }
@@ -349,6 +327,15 @@ PyObject * THPStorage_(freeWeakRef)(PyObject *_unused, PyObject *arg)
   END_HANDLE_TH_ERRORS
 }
 
+PyObject * THPStorage_(expired)(PyObject *_unused, PyObject *arg)
+{
+  HANDLE_TH_ERRORS
+  THPUtils_assert(THPUtils_checkLong(arg), "_expired(): arg must be an 'int'");
+  THStorage *weak_storage = (THStorage*)PyLong_AsVoidPtr(arg);
+  return PyBool_FromLong(weak_storage->use_count() == 0);
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject * THPStorage_(sharedFd)(THPStorage *self)
 {
   HANDLE_TH_ERRORS
@@ -390,8 +377,9 @@ static PyMethodDef THPStorage_(sharingMethods)[] = {
   {"_new_shared_filename", (PyCFunction)THPStorage_(newSharedFilename), METH_VARARGS | METH_STATIC, NULL},
   {"_new_using_filename", (PyCFunction)THPStorage_(pyNewFilenameStorage), METH_VARARGS | METH_STATIC, NULL},
 #endif
-  {"_weak_ref", (PyCFunction)THPStorage_(weakRef), METH_O, NULL},
+  {"_weak_ref", (PyCFunction)THPStorage_(weakRef), METH_NOARGS, NULL},
   {"_free_weak_ref", (PyCFunction)THPStorage_(freeWeakRef), METH_O | METH_STATIC, NULL},
+  {"_expired", (PyCFunction)THPStorage_(expired), METH_O | METH_STATIC, NULL},
   {"_shared_decref", (PyCFunction)THPStorage_(sharedDecref), METH_NOARGS, NULL},
   {"_shared_incref", (PyCFunction)THPStorage_(sharedIncref), METH_NOARGS, NULL},
   {"_get_shared_fd", (PyCFunction)THPStorage_(sharedFd), METH_NOARGS, NULL},
diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp
index 48f6e1100ac2bc..3654816d479ad2 100644
--- a/torch/csrc/jit/interpreter.cpp
+++ b/torch/csrc/jit/interpreter.cpp
@@ -337,7 +337,7 @@ struct PreprocessGraph {
 struct ContainerTensor : public at::TensorImpl {
 public:
   ContainerTensor()
-  : TensorImpl(at::Backend::Undefined,at::ScalarType::Undefined, nullptr, /* is_variable */ false) {}
+  : TensorImpl(at::UndefinedTensorId(),at::ScalarType::Undefined, nullptr, /* is_variable */ false) {}
 
   virtual ~ContainerTensor() = default;
   virtual at::IntList sizes() const override {
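The `_weak_ref`/`_expired`/`_free_weak_ref` trio introduced above replaces the finalizer scheme with a standard weak-reference protocol, just spelled with raw `THStorage*` values boxed into Python ints. The semantics mirror `std::weak_ptr`, as this analogy shows (plain C++, not the TH API):

```cpp
#include <memory>

int main() {
  auto storage = std::make_shared<int>(42); // stands in for a THStorage
  std::weak_ptr<int> weak = storage;        // _weak_ref: non-owning handle
  bool before = weak.expired();             // _expired -> false (use_count > 0)
  storage.reset();                          // last strong reference dies
  bool after = weak.expired();              // _expired -> true (use_count == 0)
  return (!before && after) ? 0 : 1;
}
```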
diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py
index 2cac31c75cde48..a346d56934b175 100644
--- a/torch/multiprocessing/reductions.py
+++ b/torch/multiprocessing/reductions.py
@@ -1,6 +1,7 @@
 import torch
 import os
 import weakref
+import threading
 import multiprocessing
 from multiprocessing.reduction import ForkingPickler
 import sys
@@ -15,19 +16,54 @@
     pass
 
 
-class StorageRef(object):
-    # An object with a cdata field which may be set to None. We subclass object
-    # instead of using a dict() to support weak references.
+class StorageWeakRef(object):
+    r"""A weak reference to a Storage.
+
+    The cdata member is a Python number containing the integer representation of
+    the Storage pointer."""
 
-    def __init__(self, ptr):
-        self.cdata = ptr
+    def __init__(self, storage):
+        self.cdata = storage._weak_ref()
+        # Save a direct reference to _free_weak_ref because the `torch` module
+        # might be cleared during Python shutdown before this module is cleared.
+        self._free_weak_ref = torch.Storage._free_weak_ref
+
+    def expired(self):
+        return torch.Storage._expired(self.cdata)
 
     def __del__(self):
-        torch.Storage._free_weak_ref(self.cdata)
+        self._free_weak_ref(self.cdata)
+
+
+class SharedCache(dict):
+    """dictionary from multiprocessing handles to StorageWeakRef"""
+
+    def __init__(self):
+        # free_dead_references() is called if the len exceeds the current
+        # limit. The limit scales with the number of remaining live objects.
+        self.limit = 128
+        self.lock = threading.Lock()
+
+    def __setitem__(self, key, storage_ref):
+        dict.__setitem__(self, key, storage_ref)
+        if len(self) > self.limit:
+            self.free_dead_references()
+
+    def free_dead_references(self):
+        # Multiple Python threads may call free_dead_references() concurrently.
+        # Without a lock, they may try deleting the same entry multiple times.
+        with self.lock:
+            live = 0
+            for key, storage_ref in list(self.items()):
+                if storage_ref.expired():
+                    del self[key]
+                else:
+                    live += 1
+            self.limit = max(128, live * 2)
 
 
-# mapping from handles to StorageRef objects
-shared_cache = weakref.WeakValueDictionary()
+# mapping from handles to StorageWeakRef objects
+shared_cache = SharedCache()
 
 
 def rebuild_event(handle):
@@ -55,7 +91,7 @@ def rebuild_cuda_tensor(tensor_cls, tensor_size, tensor_stride, tensor_offset,
     if storage is None:
         torch.cuda._lazy_init()
         storage = storage_cls._new_shared_cuda(storage_device, storage_handle, storage_size)
-        shared_cache[storage_handle] = storage._weak_ref(StorageRef)
+        shared_cache[storage_handle] = StorageWeakRef(storage)
 
     t = torch._utils._rebuild_tensor(storage, tensor_offset, tensor_size, tensor_stride)
     if tensor_cls == torch.nn.parameter.Parameter:
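`SharedCache.free_dead_references` above amortizes cleanup: a sweep only runs when an insert pushes the dict past `limit`, and the next limit is set to twice the surviving population, with a floor of 128. The same policy restated in C++ for readers of this mostly-C++ diff (the Python original also takes a lock, omitted here):

```cpp
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <map>
#include <memory>

struct SharedCache {
  std::map<int, std::weak_ptr<int>> entries; // handle -> weak storage ref
  std::size_t limit = 128;

  void insert(int key, std::weak_ptr<int> ref) {
    entries[key] = std::move(ref);
    if (entries.size() > limit) {
      free_dead_references();
    }
  }

  void free_dead_references() {
    // Drop every expired entry, then scale the next sweep threshold with
    // the number of survivors so sweeps stay amortized O(1) per insert.
    for (auto it = entries.begin(); it != entries.end();) {
      it = it->second.expired() ? entries.erase(it) : std::next(it);
    }
    limit = std::max<std::size_t>(128, entries.size() * 2);
  }
};
```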
@@ -125,11 +161,7 @@ def reduce_tensor(tensor):
         (device, handle, storage_size, storage_offset) = storage._share_cuda_()
         tensor_offset = tensor.storage_offset()
 
-        # WARNING! This call to _weak_ref could lead to O(n) deleter
-        # behavior, if you repeatedly call it on the same Storage (all
-        # other sites are guarded by shared_cache; maybe this site
-        # should be too?)
-        shared_cache[handle] = storage._weak_ref(StorageRef)
+        shared_cache[handle] = StorageWeakRef(storage)
 
         return (rebuild_cuda_tensor,
                 (type(tensor),
@@ -159,7 +191,7 @@ def storage_from_cache(cls, key):
     storage_ref = shared_cache.get(key)
     if storage_ref is None:
         return None
-    return cls._new_with_weak_ptr(storage_ref)
+    return cls._new_with_weak_ptr(storage_ref.cdata)
 
 
 def rebuild_storage_fd(cls, df, size):
@@ -172,7 +204,7 @@ def rebuild_storage_fd(cls, df, size):
         if storage is not None:
             return storage
         storage = cls._new_shared_fd(fd, size)
-        shared_cache[fd_id(fd)] = storage._weak_ref(StorageRef)
+        shared_cache[fd_id(fd)] = StorageWeakRef(storage)
         return storage
     finally:
         os.close(fd)
@@ -183,7 +215,7 @@ def rebuild_storage_filename(cls, manager, handle, size):
     if storage is not None:
         return storage._shared_decref()
     storage = cls._new_shared_filename(manager, handle, size)
-    shared_cache[handle] = storage._weak_ref(StorageRef)
+    shared_cache[handle] = StorageWeakRef(storage)
     return storage._shared_decref()
 
 
@@ -214,11 +246,7 @@ def reduce_storage(storage):
         metadata = (df, size)
         rebuild = rebuild_storage_fd
 
-    # WARNING! This call to _weak_ref could lead to O(n) deleter
-    # behavior, if you repeatedly call it on the same Storage (all
-    # other sites are guarded by shared_cache; maybe this site
-    # should be too?)
-    shared_cache[cache_key] = storage._weak_ref(StorageRef)
+    shared_cache[cache_key] = StorageWeakRef(storage)
 
     return (rebuild, (type(storage),) + metadata)