diff --git a/CMakeLists.txt b/CMakeLists.txt
index 60d69ef2d9bc6..488605d5ea459 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,11 +5,10 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
 # ---[ Project and semantic versioning.
 project(Caffe2 CXX C)
 
-set(CAFFE2_VERSION_MAJOR 0)
-set(CAFFE2_VERSION_MINOR 8)
-set(CAFFE2_VERSION_PATCH 2)
-set(CAFFE2_VERSION
-  "${CAFFE2_VERSION_MAJOR}.${CAFFE2_VERSION_MINOR}.${CAFFE2_VERSION_PATCH}")
+set(CMAKE_CXX_STANDARD 11)
+if (NOT MSVC)
+  set(CMAKE_C_STANDARD 11)
+endif()
 
 # One variable that determines whether the current cmake process is being run
 # with the main Caffe2 library. This is useful for building modules - if
@@ -134,6 +133,22 @@ if (ANDROID OR IOS)
   set(BUILD_ATEN_MOBILE ON)
 endif()
 
+# ---[ Utils
+# TODO: merge the following 3 files into cmake/public/utils.cmake.
+include(cmake/Utils.cmake)
+include(cmake/public/utils.cmake)
+
+# ---[ Version numbers for generated libraries
+set(TORCH_DEFAULT_VERSION "1.0.0")
+set(TORCH_BUILD_VERSION "${TORCH_DEFAULT_VERSION}" CACHE STRING "Torch build version")
+if (NOT TORCH_BUILD_VERSION)
+  # An empty string was specified so force version to the default
+  set(TORCH_BUILD_VERSION "${TORCH_DEFAULT_VERSION}"
+    CACHE STRING "Torch build version" FORCE)
+endif()
+caffe2_parse_version_str(TORCH ${TORCH_BUILD_VERSION})
+caffe2_parse_version_str(CAFFE2 ${TORCH_BUILD_VERSION})
+
 # ---[ CMake scripts + modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
@@ -160,11 +175,6 @@ include(cmake/MiscCheck.cmake)
 # External projects
 include(ExternalProject)
 
-# ---[ Utils
-# TODO: merge the following 3 files into cmake/public/utils.cmake.
-include(cmake/Utils.cmake)
-include(cmake/public/utils.cmake)
-
 # ---[ Dependencies
 include(cmake/Dependencies.cmake)
 
@@ -294,6 +304,7 @@ include_directories(BEFORE ${PROJECT_BINARY_DIR})
 include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/)
 
 # ---[ Main build
+add_subdirectory(c10)
 add_subdirectory(caffe2)
 
 # --[ Documentation
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0f46fa1cf62a7..f0be7e770b97e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -262,9 +262,9 @@ than Linux, which are worth keeping in mind when fixing these problems.
 1. Symbols are NOT exported by default on Windows; instead, you have to explicitly
    mark a symbol as exported/imported in a header file with `__declspec(dllexport)` /
    `__declspec(dllimport)`. We have codified this pattern into a set of macros
-   which follow the convention `*_API`, e.g., `AT_API` inside ATen. (Every separate
-   shared library needs a unique macro name, because symbol visibility is on a per
-   shared library basis.)
+   which follow the convention `*_API`, e.g., `CAFFE2_API` inside Caffe2 and ATen.
+   (Every separate shared library needs a unique macro name, because symbol visibility
+   is on a per shared library basis. See c10/macros/Macros.h for more details.)
 
    The upshot is if you see an "unresolved external" error in your Windows build, this
    is probably because you forgot to mark a function with `*_API`.
However, there is diff --git a/README.md b/README.md index e0aa68bf8b3e2..918aac0627cf2 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ conda install numpy pyyaml mkl mkl-include setuptools cmake cffi typing conda install -c mingfeima mkldnn # Add LAPACK support for the GPU -conda install -c pytorch magma-cuda80 # or magma-cuda90 if CUDA 9 +conda install -c pytorch magma-cuda92 # or [magma-cuda80 | magma-cuda91] depending on your cuda version ``` On macOS diff --git a/aten/src/ATen/CPUGeneral.h b/aten/src/ATen/CPUGeneral.h index b406669053dd8..04bd0aacb528f 100644 --- a/aten/src/ATen/CPUGeneral.h +++ b/aten/src/ATen/CPUGeneral.h @@ -1,12 +1,12 @@ #pragma once -// Using AT_API is crucial as otherwise you'll see +// Using CAFFE2_API is crucial as otherwise you'll see // linking errors using MSVC // See https://msdn.microsoft.com/en-us/library/a90k134d.aspx -// This header adds this if using AT_API +// This header adds this if using CAFFE2_API #include "ATen/core/ATenGeneral.h" namespace at { -AT_API void set_num_threads(int); -AT_API int get_num_threads(); +CAFFE2_API void set_num_threads(int); +CAFFE2_API int get_num_threads(); } diff --git a/aten/src/ATen/CPUTypeDefault.h b/aten/src/ATen/CPUTypeDefault.h index c9776b7b0a2cc..6a854c98d16e3 100644 --- a/aten/src/ATen/CPUTypeDefault.h +++ b/aten/src/ATen/CPUTypeDefault.h @@ -3,7 +3,7 @@ namespace at { -struct AT_API CPUTypeDefault : public TypeDefault { +struct CAFFE2_API CPUTypeDefault : public TypeDefault { CPUTypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) : TypeDefault(type_id, is_variable, is_undefined) {} Allocator* allocator() const override; diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 4e147cffabbe8..1f546f8574a78 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -22,10 +22,10 @@ namespace at { -struct Tensor; +class Tensor; -class AT_API Context { -public: +class CAFFE2_API Context { + public: Context(); TypeExtendedInterface* getNonVariableTypeRaw(Backend p, ScalarType s) { return static_cast(globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s)); @@ -133,7 +133,7 @@ class AT_API Context { friend struct Type; }; -AT_API Context & globalContext(); +CAFFE2_API Context& globalContext(); static inline void init() { globalContext(); @@ -153,11 +153,11 @@ static inline TypeExtendedInterface& getNonVariableType(DeviceType p, ScalarType return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API TypeExtendedInterface& getType(TensorOptions options); -AT_API TypeExtendedInterface& getType(const TensorImpl*); -AT_API TypeExtendedInterface& getType(const Tensor&); +CAFFE2_API TypeExtendedInterface& getType(TensorOptions options); +CAFFE2_API TypeExtendedInterface& getType(const TensorImpl*); +CAFFE2_API TypeExtendedInterface& getType(const Tensor&); -AT_API Allocator* getCPUAllocator(); +CAFFE2_API Allocator* getCPUAllocator(); static inline TypeExtendedInterface& CPU(ScalarType s) { return getNonVariableType(Backend::CPU, s); diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index 5ed9899fc5500..d254fb568fd09 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -10,8 +10,8 @@ namespace at { -AT_API ScalarType toScalarType(const DLDataType& dtype); -AT_API DLManagedTensor * toDLPack(const Tensor& src); -AT_API Tensor fromDLPack(const DLManagedTensor* src); +CAFFE2_API ScalarType toScalarType(const DLDataType& dtype); +CAFFE2_API DLManagedTensor* toDLPack(const Tensor& src); +CAFFE2_API Tensor 
fromDLPack(const DLManagedTensor* src); } //namespace at diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 3453155da5b1d..cd95271adf427 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -9,9 +9,12 @@ namespace at { -AT_API std::vector infer_size(IntList a, IntList b); -AT_API std::tuple, std::vector > inferExpandGeometry( - IntList tensor_sizes, IntList tensor_strides, IntList sizes); +CAFFE2_API std::vector infer_size(IntList a, IntList b); +CAFFE2_API std::tuple, std::vector> +inferExpandGeometry( + IntList tensor_sizes, + IntList tensor_strides, + IntList sizes); // avoid copy-construction of Tensor by using a reference_wrapper. inline void check_defined(std::initializer_list> tensors, const char *api_name) { diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 42b670bea0854..7ffb68a4963c0 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -5,7 +5,7 @@ #include "ATen/core/Error.h" namespace at { -struct AT_API SparseTensorImpl : public TensorImpl { +struct CAFFE2_API SparseTensorImpl : public TensorImpl { // Stored in COO format, indices + values. // INVARIANTS: @@ -157,11 +157,11 @@ struct AT_API SparseTensorImpl : public TensorImpl { sparseDims_ = sparseDims; denseDims_ = denseDims; - auto empty_indices = indices().type().tensor({sparseDims, 0}); + auto empty_indices = at::empty({sparseDims, 0}, indices().options()); std::vector values_size = {0}; auto dense_size = sizes().slice(sparseDims); values_size.insert(values_size.end(), dense_size.begin(), dense_size.end()); - auto empty_values = values().type().tensor(values_size); + auto empty_values = at::empty(values_size, values().options()); set_indices_and_values_unsafe(empty_indices, empty_values); refresh_numel(); } diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index b11c7bb159900..20ab6bb6690c5 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -12,8 +12,4 @@ bool TensorGeometry::is_contiguous() const { return at::geometry_is_contiguous(sizes_, strides_); } -Tensor TensorGeometry::zeros_with_stride(const Type& type) const { - return type.tensor(sizes_, strides_).zero_(); -} - } // namespace at diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 5f441ed8fa71c..c989d2ca8f7d0 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -5,7 +5,7 @@ namespace at { -struct AT_API TensorGeometry { +struct CAFFE2_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} explicit TensorGeometry(IntList sizes) @@ -30,9 +30,6 @@ struct AT_API TensorGeometry { // true if the tensor is contiguous bool is_contiguous() const; - // creates a new tensor with the sizes and strides of the source - Tensor zeros_with_stride(const Type& type) const; - int64_t dim() const { return sizes_.size(); } int64_t size(int64_t dim) const { dim = maybe_wrap_dim(dim, this->dim()); diff --git a/aten/src/ATen/TensorOperators.h b/aten/src/ATen/TensorOperators.h index 57a986b5d46f7..f4bdab0bf35d7 100644 --- a/aten/src/ATen/TensorOperators.h +++ b/aten/src/ATen/TensorOperators.h @@ -68,9 +68,9 @@ inline Tensor Tensor::operator[](int64_t index) const { #define AT_FORALL_BINARY_OPS(_) \ _(+,x.add(y), y.add(x)) \ _(*,x.mul(y), y.mul(x)) \ -_(-,x.sub(y), y.type().tensor().resize_(y.sizes()).fill_(x).sub_(y)) \ -_(/,x.div(y), y.type().tensor().resize_(y.sizes()).fill_(x).div_(y)) \ -_(%,x.remainder(y), 
y.type().tensor().resize_(y.sizes()).fill_(x).remainder_(y)) \ +_(-,x.sub(y), ::at::empty(y.sizes(), y.options()).fill_(x).sub_(y)) \ +_(/,x.div(y), ::at::empty(y.sizes(), y.options()).fill_(x).div_(y)) \ +_(%,x.remainder(y), ::at::empty(y.sizes(), y.options()).fill_(x).remainder_(y)) \ _(<,x.lt(y), y.gt(x)) \ _(<=,x.le(y), y.ge(x)) \ _(>,x.gt(y),y.lt(x)) \ diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 2443bde4b482c..f65093a586004 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -12,7 +12,7 @@ namespace at { // make sense. These are particularly useful for native functions, // which do NO argument checking by default. -struct AT_API TensorArg { +struct CAFFE2_API TensorArg { Tensor tensor; const char* name; int pos; // 1-indexed @@ -22,7 +22,7 @@ struct AT_API TensorArg { const Tensor& operator*() const { return tensor; } }; -struct AT_API TensorGeometryArg { +struct CAFFE2_API TensorGeometryArg { TensorGeometry tensor; const char* name; int pos; // 1-indexed @@ -49,40 +49,80 @@ using CheckedFrom = const char*; // not TensorGeometryArg, because the Tensor to TensorGeometry // conversion will blow up if you have undefined tensors. -AT_API std::ostream& operator<<(std::ostream & out, TensorGeometryArg t); -AT_API void checkDim(CheckedFrom c, const TensorGeometryArg& t, int64_t dim); +CAFFE2_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t); +CAFFE2_API void checkDim( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim); // NB: this is an inclusive-exclusive range -AT_API void checkDimRange(CheckedFrom c, const TensorGeometryArg& t, int64_t dim_start, int64_t dim_end); -AT_API void checkSameDim(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); -AT_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); -AT_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); -AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntList sizes); -AT_API void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t size); -AT_API void checkNumel(CheckedFrom c, const TensorGeometryArg& t, int64_t numel); -AT_API void checkSameNumel(CheckedFrom c, const TensorGeometryArg& t1, const TensorGeometryArg& t2); -AT_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); -AT_API void checkScalarType(CheckedFrom c, const TensorArg& t, ScalarType s); -AT_API void checkScalarTypes(CheckedFrom c, const TensorArg& t, at::ArrayRef l); -AT_API void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); -AT_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); -AT_API void checkSameType(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); -AT_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); -AT_API void checkSameSize(CheckedFrom c, const TensorArg& t1, const TensorArg& t2); -AT_API void checkDefined(CheckedFrom c, const TensorArg& t); -AT_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); +CAFFE2_API void checkDimRange( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t dim_start, + int64_t dim_end); +CAFFE2_API void checkSameDim( + CheckedFrom c, + const TensorGeometryArg& t1, + const TensorGeometryArg& t2); +CAFFE2_API void checkContiguous(CheckedFrom c, const TensorGeometryArg& t); +CAFFE2_API void checkAllContiguous(CheckedFrom c, at::ArrayRef ts); +CAFFE2_API void checkSize( + CheckedFrom c, + const TensorGeometryArg& t, + IntList sizes); +CAFFE2_API void checkSize( + CheckedFrom c, + const TensorGeometryArg& t, 
+ int64_t dim, + int64_t size); +CAFFE2_API void checkNumel( + CheckedFrom c, + const TensorGeometryArg& t, + int64_t numel); +CAFFE2_API void checkSameNumel( + CheckedFrom c, + const TensorGeometryArg& t1, + const TensorGeometryArg& t2); +CAFFE2_API void checkAllSameNumel(CheckedFrom c, ArrayRef tensors); +CAFFE2_API void checkScalarType( + CheckedFrom c, + const TensorArg& t, + ScalarType s); +CAFFE2_API void checkScalarTypes( + CheckedFrom c, + const TensorArg& t, + at::ArrayRef l); +CAFFE2_API void checkSameGPU( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +CAFFE2_API void checkAllSameGPU(CheckedFrom c, ArrayRef tensors); +CAFFE2_API void checkSameType( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +CAFFE2_API void checkAllSameType(CheckedFrom c, ArrayRef tensors); +CAFFE2_API void checkSameSize( + CheckedFrom c, + const TensorArg& t1, + const TensorArg& t2); +CAFFE2_API void checkDefined(CheckedFrom c, const TensorArg& t); +CAFFE2_API void checkAllDefined(CheckedFrom c, at::ArrayRef t); // FixMe: does TensorArg slow things down? -AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend backend); +CAFFE2_API void checkBackend( + CheckedFrom c, + at::ArrayRef t, + at::Backend backend); // Methods for getting data_ptr if tensor is defined -AT_API void * maybe_data_ptr(const Tensor& tensor); -AT_API void * maybe_data_ptr(const TensorArg& tensor); +CAFFE2_API void* maybe_data_ptr(const Tensor& tensor); +CAFFE2_API void* maybe_data_ptr(const TensorArg& tensor); // Return if the tensor geometry represented by `sizes` and `strides` is contiguous // Although we cache is_contiguous in tensor now, this is till useful because it // allows checking if a particular geometry is contiguous without explicitly // constructing a tensor, e.g., when you want to choose a kernel strategy based // on whether a subgeometry is contiguous. -AT_API bool geometry_is_contiguous(IntList sizes, IntList strides); - +CAFFE2_API bool geometry_is_contiguous(IntList sizes, IntList strides); } diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index c4473d1471ab7..21ade98cba79c 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -24,7 +24,7 @@ namespace at { -AT_API int _crash_if_asan(int); +CAFFE2_API int _crash_if_asan(int); static inline const Storage& checked_storage( const Storage& expr, diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h index a6769b10b93ee..93f894ea66b97 100644 --- a/aten/src/ATen/core/ATenCoreTest.h +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -4,5 +4,5 @@ namespace at { -AT_CORE_API int CoreTest(); +CAFFE2_API int CoreTest(); } diff --git a/aten/src/ATen/core/ATenGeneral.h b/aten/src/ATen/core/ATenGeneral.h index cbc1d6f13692f..cb946c93c9b96 100644 --- a/aten/src/ATen/core/ATenGeneral.h +++ b/aten/src/ATen/core/ATenGeneral.h @@ -1,8 +1,3 @@ #pragma once #include "ATen/core/Macros.h" - -// TODO: Merge the *_API macros. 
-#define AT_API AT_CORE_API -#define AT_EXPORT AT_CORE_EXPORT -#define AT_IMPORT AT_CORE_IMPORT diff --git a/aten/src/ATen/core/Allocator.h b/aten/src/ATen/core/Allocator.h index dc520008e3bbf..a3bae36efe4a4 100644 --- a/aten/src/ATen/core/Allocator.h +++ b/aten/src/ATen/core/Allocator.h @@ -115,7 +115,7 @@ struct Allocator { } }; -struct AT_CORE_API InefficientStdFunctionContext { +struct CAFFE2_API InefficientStdFunctionContext { std::unique_ptr> ptr_; InefficientStdFunctionContext( std::unique_ptr>&& ptr) diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h index 9aa3ac826ce78..7a4e9e6b1dba2 100644 --- a/aten/src/ATen/core/Backtrace.h +++ b/aten/src/ATen/core/Backtrace.h @@ -8,7 +8,7 @@ namespace at { /// Utility to demangle a C++ symbol name. -AT_CORE_API std::string demangle(const char* name); +CAFFE2_API std::string demangle(const char* name); /// Returns the printable name of the type. template @@ -21,7 +21,7 @@ inline const char* demangle_type() { #endif // __GXX_RTTI } -AT_CORE_API std::string get_backtrace( +CAFFE2_API std::string get_backtrace( size_t frames_to_skip = 0, size_t maximum_number_of_frames = 64, bool skip_python_frames = true); diff --git a/aten/src/ATen/core/Device.h b/aten/src/ATen/core/Device.h index cd3efb6734e2d..a06d5f1e0d166 100644 --- a/aten/src/ATen/core/Device.h +++ b/aten/src/ATen/core/Device.h @@ -21,7 +21,7 @@ namespace at { /// 1. A negative index represents the current device, a non-negative index /// represents a specific, concrete device, /// 2. When the device type is CPU, the device index must be zero. -struct AT_CORE_API Device { +struct CAFFE2_API Device { using Type = at::DeviceType; /// Constructs a new `Device` from a `DeviceType` and an optional device @@ -92,7 +92,7 @@ struct AT_CORE_API Device { int32_t index_ = -1; }; -AT_CORE_API std::ostream& operator<<( +CAFFE2_API std::ostream& operator<<( std::ostream& stream, const at::Device& device); diff --git a/aten/src/ATen/core/DeviceType.h b/aten/src/ATen/core/DeviceType.h index 870b1e5bf9e53..a4342eade903a 100644 --- a/aten/src/ATen/core/DeviceType.h +++ b/aten/src/ATen/core/DeviceType.h @@ -26,11 +26,11 @@ enum class DeviceType : int32_t { ONLY_FOR_TEST = 20901701, // This device type is only for test. }; -AT_CORE_API std::string DeviceTypeName( +CAFFE2_API std::string DeviceTypeName( at::DeviceType d, bool lower_case = false); -AT_CORE_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type); +CAFFE2_API std::ostream& operator<<(std::ostream& stream, at::DeviceType type); } // namespace at diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h index de3231180f4f7..a36608256ddf0 100644 --- a/aten/src/ATen/core/Error.h +++ b/aten/src/ATen/core/Error.h @@ -19,7 +19,7 @@ namespace at { namespace detail { // Obtains the base name from a full path. -AT_CORE_API std::string StripBasename(const std::string& full_path); +CAFFE2_API std::string StripBasename(const std::string& full_path); inline std::ostream& _str(std::ostream& ss) { return ss; @@ -56,7 +56,7 @@ inline std::string str(const char* c_str) { } /// Represents a location in source code (for debugging). 
-struct AT_CORE_API SourceLocation { +struct CAFFE2_API SourceLocation { const char* function; const char* file; uint32_t line; @@ -71,7 +71,7 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); /// /// NB: at::Error is handled specially by the default torch to suppress the /// backtrace, see torch/csrc/Exceptions.h -class AT_CORE_API Error : public std::exception { +class CAFFE2_API Error : public std::exception { std::vector msg_stack_; std::string backtrace_; @@ -128,7 +128,7 @@ class AT_CORE_API Error : public std::exception { } }; -class AT_CORE_API Warning { +class CAFFE2_API Warning { using handler_t = void (*)(const SourceLocation& source_location, const char* msg); @@ -152,7 +152,7 @@ class AT_CORE_API Warning { // A utility function to return an exception std::string by prepending its // exception type before its what() content -AT_CORE_API std::string GetExceptionString(const std::exception& e); +CAFFE2_API std::string GetExceptionString(const std::exception& e); } // namespace at diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h index c6ac26b8a9e0e..4906770271f27 100644 --- a/aten/src/ATen/core/Formatting.h +++ b/aten/src/ATen/core/Formatting.h @@ -8,10 +8,13 @@ namespace at { -AT_API std::ostream& operator<<(std::ostream & out, IntList list); -AT_API std::ostream& operator<<(std::ostream & out, Backend b); -AT_API std::ostream& operator<<(std::ostream & out, const Type & t); -AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +CAFFE2_API std::ostream& operator<<(std::ostream& out, IntList list); +CAFFE2_API std::ostream& operator<<(std::ostream& out, Backend b); +CAFFE2_API std::ostream& operator<<(std::ostream& out, const Type& t); +CAFFE2_API std::ostream& print( + std::ostream& stream, + const Tensor& tensor, + int64_t linesize); static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { return print(out,t,80); } diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index b8894c4307b04..fce3d35636c27 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -5,7 +5,7 @@ namespace at { -struct AT_API Generator { +struct CAFFE2_API Generator { Generator() {}; Generator(const Generator& other) = delete; Generator(Generator&& other) = delete; diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index 47a8e8e52d2ad..ec72fe0067dcb 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -34,8 +34,8 @@ namespace at { namespace detail { -AT_CORE_API float halfbits2float(unsigned short bits); -AT_CORE_API unsigned short float2halfbits(float value); +CAFFE2_API float halfbits2float(unsigned short bits); +CAFFE2_API unsigned short float2halfbits(float value); } // namespace detail @@ -178,7 +178,7 @@ To checked_convert(From f, const char* name) { return convert(f); } -AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); +CAFFE2_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h index 58632ce111db5..268fe6725356f 100644 --- a/aten/src/ATen/core/IdWrapper.h +++ b/aten/src/ATen/core/IdWrapper.h @@ -22,7 +22,7 @@ namespace at { * for you, given the underlying type supports it. 
*/ template -class AT_CORE_API IdWrapper { +class CAFFE2_API IdWrapper { public: using underlying_type = UnderlyingType; using concrete_type = ConcreteType; diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h index 578e02e739d0d..53cedf04e4601 100644 --- a/aten/src/ATen/core/LegacyTypeDispatch.h +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -30,7 +30,7 @@ namespace at { -struct AT_CORE_API LegacyTypeInitInterface { +struct CAFFE2_API LegacyTypeInitInterface { virtual ~LegacyTypeInitInterface() {} virtual void initCPU() const { AT_ERROR("cannot use CPU without ATen library"); @@ -42,15 +42,15 @@ struct AT_CORE_API LegacyTypeInitInterface { AT_ERROR("cannot use complex without ATen Complex library"); } }; -struct AT_CORE_API LegacyTypeInitArgs {}; +struct CAFFE2_API LegacyTypeInitArgs {}; AT_DECLARE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs); #define REGISTER_LEGACY_TYPE_INIT(clsname) AT_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) -AT_CORE_API const LegacyTypeInitInterface& getLegacyTypeInit(); +CAFFE2_API const LegacyTypeInitInterface& getLegacyTypeInit(); struct Type; -struct AT_CORE_API LegacyTypeDeleter { +struct CAFFE2_API LegacyTypeDeleter { using TypeDeleterFun = void(Type*); TypeDeleterFun *fn_ = nullptr; LegacyTypeDeleter() {} @@ -62,8 +62,8 @@ struct AT_CORE_API LegacyTypeDeleter { } }; -class AT_CORE_API LegacyTypeDispatch { -public: +class CAFFE2_API LegacyTypeDispatch { + public: using TypeUniquePtr = std::unique_ptr; // WARNING: This function has the precondition that you have // initialized the type you want to call. This initialization @@ -150,6 +150,6 @@ class AT_CORE_API LegacyTypeDispatch { [static_cast(ScalarType::NumOptions)]; }; -AT_CORE_API LegacyTypeDispatch & globalLegacyTypeDispatch(); +CAFFE2_API LegacyTypeDispatch& globalLegacyTypeDispatch(); } // namespace at diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 244124475bc08..cb48b68782ab0 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -3,41 +3,7 @@ #include #include -// You can use the definition AT_CORE_STATIC_WINDOWS to control whether -// or not we apply __declspec. You will want to set this as -// -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links -// against ATen/core on Windows, when ATen/core is built as a -// static library (in which case, saying the symbol is coming -// from a DLL would be incorrect). - -#ifdef _WIN32 -#if !defined(AT_CORE_STATIC_WINDOWS) -#define AT_CORE_EXPORT __declspec(dllexport) -#define AT_CORE_IMPORT __declspec(dllimport) -#else // !defined(AT_CORE_STATIC_WINDOWS) -#define AT_CORE_EXPORT -#define AT_CORE_IMPORT -#endif // !defined(AT_CORE_STATIC_WINDOWS) -#else // _WIN32 -#if defined(__GNUC__) -#define AT_CORE_EXPORT __attribute__((__visibility__("default"))) -#else // defined(__GNUC__) -#define AT_CORE_EXPORT -#endif // defined(__GNUC__) -#define AT_CORE_IMPORT AT_CORE_EXPORT -#endif // _WIN32 - -// AT_CORE_API is a macro that, depends on whether you are building the -// main library or not, resolves to either AT_CORE_EXPORT or -// AT_CORE_IMPORT. -// - -// TODO: unify the controlling macros. 
-#if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) -#define AT_CORE_API AT_CORE_EXPORT -#else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) -#define AT_CORE_API AT_CORE_IMPORT -#endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#include "c10/macros/Macros.h" #ifdef __CUDACC__ // Designates functions callable from the host (CPU) and the device (GPU) @@ -50,13 +16,6 @@ #define AT_DEVICE #endif -// Disable the copy and assignment operator for a class. Note that this will -// disable the usage of the class in std containers. -#define AT_DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname&) = delete; \ - classname& operator=(const classname&) = delete - - #if defined(__ANDROID__) #define AT_ANDROID 1 #define AT_MOBILE 1 diff --git a/aten/src/ATen/core/OptionsGuard.h b/aten/src/ATen/core/OptionsGuard.h index b359638d53a61..fc078db6bf90b 100644 --- a/aten/src/ATen/core/OptionsGuard.h +++ b/aten/src/ATen/core/OptionsGuard.h @@ -20,7 +20,7 @@ struct DefaultTensorOptions { /// Defined in OptionsGuard.cpp because we can't use optional in headers, due /// to Windows and other compilers. /// TODO: The inability to use optional in headers is no longer true - AT_API static TensorOptions& get(); + CAFFE2_API static TensorOptions& get(); private: /// This is an optional because of compiler bugs that mis-initialize static @@ -64,8 +64,9 @@ struct OptionsGuard { #else // AT_MOBILE struct DefaultTensorOptions { - AT_API static const TensorOptions& get(); -private: + CAFFE2_API static const TensorOptions& get(); + + private: static TensorOptions options_; }; diff --git a/aten/src/ATen/core/Registry.h b/aten/src/ATen/core/Registry.h index 8f3caffe49154..98a3e4a18c725 100644 --- a/aten/src/ATen/core/Registry.h +++ b/aten/src/ATen/core/Registry.h @@ -44,7 +44,7 @@ inline void PrintOffendingKey(const std::string& key) { * objects. */ template -class AT_API Registry { +class CAFFE2_API Registry { public: typedef std::function Creator; @@ -114,7 +114,7 @@ class AT_API Registry { }; template -class AT_API Registerer { +class CAFFE2_API Registerer { public: Registerer( const SrcType& key, @@ -152,11 +152,12 @@ class AT_API Registerer { * declaration, as well as creating a convenient typename for its corresponding * registerer. */ -#define AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - AT_API Registry, __VA_ARGS__>* RegistryName(); \ - typedef Registerer, __VA_ARGS__> \ - Registerer##RegistryName; \ +#define AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) 
\ + CAFFE2_API Registry, __VA_ARGS__>* \ + RegistryName(); \ + typedef Registerer, __VA_ARGS__> \ + Registerer##RegistryName; \ extern template class Registerer, __VA_ARGS__>; #define AT_DEFINE_TYPED_REGISTRY( \ diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index de01a56ce3374..f1b40d6f8053b 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -12,10 +12,10 @@ namespace at { -struct Tensor; +class Tensor; -class AT_API Scalar { -public: +class CAFFE2_API Scalar { + public: Scalar() : Scalar(int64_t(0)) {} #define DEFINE_IMPLICIT_CTOR(type,name,member) \ diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index 6fe88bfadb05f..fad2f765fe433 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -178,17 +178,17 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { /* u1 i1 i2 i4 i8 f2 f4 f8 */ /* u1 */ { u1, i2, i2, i4, i8, f2, f4, f8 }, /* i1 */ { i2, i1, i2, i4, i8, f2, f4, f8 }, - /* i2 */ { i2, i2, i2, i4, i8, f4, f4, f8 }, - /* i4 */ { i4, i4, i4, i4, i8, f8, f4, f8 }, - /* i8 */ { i8, i8, i8, i8, i8, f8, f4, f8 }, - /* f2 */ { f2, f2, f4, f8, f8, f2, f4, f8 }, + /* i2 */ { i2, i2, i2, i4, i8, f2, f4, f8 }, + /* i4 */ { i4, i4, i4, i4, i8, f2, f4, f8 }, + /* i8 */ { i8, i8, i8, i8, i8, f2, f4, f8 }, + /* f2 */ { f2, f2, f2, f2, f2, f2, f4, f8 }, /* f4 */ { f4, f4, f4, f4, f4, f4, f4, f8 }, /* f8 */ { f8, f8, f8, f8, f8, f8, f8, f8 }, }; return _promoteTypesLookup[static_cast(a)][static_cast(b)]; } -struct Tensor; +class Tensor; typedef ArrayRef IntList; typedef ArrayRef TensorList; diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h index 483144794f46e..cd2c2f51f4960 100644 --- a/aten/src/ATen/core/SmallVector.h +++ b/aten/src/ATen/core/SmallVector.h @@ -59,7 +59,7 @@ static inline uint64_t NextPowerOf2(uint64_t A) { } // namespace detail /// This is all the non-templated stuff common to all SmallVectors. 
-class AT_CORE_API SmallVectorBase { +class CAFFE2_API SmallVectorBase { protected: void *BeginX, *EndX, *CapacityX; diff --git a/aten/src/ATen/core/SparseTensorRef.h b/aten/src/ATen/core/SparseTensorRef.h index 9c9fada2dc711..9a5bbddb783c0 100644 --- a/aten/src/ATen/core/SparseTensorRef.h +++ b/aten/src/ATen/core/SparseTensorRef.h @@ -2,7 +2,7 @@ namespace at { -struct Tensor; +class Tensor; struct SparseTensorRef { explicit SparseTensorRef(const Tensor& t): tref(t) {} const Tensor& tref; diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index ab201be88d630..cd42b33d12e2b 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -4,8 +4,8 @@ namespace at { -struct AT_API Storage { -public: +struct CAFFE2_API Storage { + public: Storage() {} Storage(c10::intrusive_ptr ptr) : storage_impl_(std::move(ptr)) {} Storage( diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index cc63bd0090666..bba2df4e0d1be 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -10,7 +10,7 @@ namespace at { struct Type; -struct AT_API StorageImpl : public c10::intrusive_ptr_target { +struct CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { public: StorageImpl( caffe2::TypeMeta data_type, diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 7445c332200da..fa31741313db3 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -15,7 +15,7 @@ namespace at { struct Generator; struct Type; -struct Tensor; +class Tensor; struct TensorOptions; } // namespace at @@ -37,11 +37,12 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. -struct AT_API Tensor { +class CAFFE2_API Tensor { +public: Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) - : tensor_impl_(std::move(tensor_impl)) { - if (tensor_impl_.get() == nullptr) { + : impl_(std::move(tensor_impl)) { + if (impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } } @@ -50,25 +51,25 @@ struct AT_API Tensor { Tensor(Tensor&&) = default; int64_t dim() const { - return tensor_impl_->dim(); + return impl_->dim(); } TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); + return impl_.get(); } TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); + return impl_.release(); } const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; + return impl_; } bool defined() const { - return tensor_impl_; + return impl_; } void reset() { - tensor_impl_.reset(); + impl_.reset(); } // The following overloads are very intruiging. 
Consider the following @@ -102,11 +103,11 @@ struct AT_API Tensor { // Tensor& operator=(const Tensor&) & = default; // Tensor& operator=(Tensor&&) & = default; Tensor& operator=(const Tensor& x) & { - tensor_impl_ = x.tensor_impl_; + impl_ = x.impl_; return *this; } Tensor& operator=(Tensor&& x) & { - tensor_impl_ = std::move(x.tensor_impl_); + impl_ = std::move(x.impl_); return *this; } @@ -115,37 +116,37 @@ struct AT_API Tensor { Tensor& operator=(Tensor&&) &&; bool is_same(const Tensor& other) const noexcept { - return tensor_impl_ == other.tensor_impl_; + return impl_ == other.impl_; } size_t use_count() const noexcept { - return tensor_impl_.use_count(); + return impl_.use_count(); } size_t weak_use_count() const noexcept { - return tensor_impl_.weak_use_count(); + return impl_.weak_use_count(); } const char * toString() const; IntList sizes() const { - return tensor_impl_->sizes(); + return impl_->sizes(); } IntList strides() const { - return tensor_impl_->strides(); + return impl_->strides(); } int64_t ndimension() const { return dim(); } Type & type() const { - return tensor_impl_->type(); + return impl_->type(); } TensorTypeId type_id() const { - return tensor_impl_->type_id(); + return impl_->type_id(); } ScalarType scalar_type() const { - return dataTypeToScalarType(tensor_impl_->dtype().id()); + return dataTypeToScalarType(impl_->dtype().id()); } const Storage& storage() const { - return tensor_impl_->storage(); + return impl_->storage(); } Tensor toType(const Type & t, bool non_blocking=false) const; Tensor & copy_(const Tensor & src, bool non_blocking=false); @@ -172,20 +173,12 @@ struct AT_API Tensor { template T * data() const; + template + T item() const; + // Purposely not defined here to avoid inlining void print() const; - //toLongData(), toFloatData() etc. - #define TO_TYPE_DATA(T,name,_) \ - T * to##name##Data() const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) - #undef TO_TYPE_DATA - - #define TO_C_TYPE(T,name,_) \ - T toC##name () const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) - #undef TO_C_TYPE - // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and // dimension. 
template @@ -230,18 +223,18 @@ struct AT_API Tensor { // ~~~~~ Autograd API ~~~~~ Tensor& set_requires_grad(bool requires_grad) { - tensor_impl_->set_requires_grad(requires_grad); + impl_->set_requires_grad(requires_grad); return *this; } bool requires_grad() const { - return tensor_impl_->requires_grad(); + return impl_->requires_grad(); } Tensor& grad() { - return tensor_impl_->grad(); + return impl_->grad(); } const Tensor& grad() const { - return tensor_impl_->grad(); + return impl_->grad(); } void set_data(Tensor new_data); @@ -653,35 +646,35 @@ struct AT_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr impl_; }; -struct AT_API WeakTensor { - WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} +struct CAFFE2_API WeakTensor { + WeakTensor(const Tensor& t) : weak_impl_(t.impl_) {} // XXX: this can return undefined tensors // Ideally it would be at::optional, but MSVC is too cool for that Tensor lock() const { - return Tensor(weak_tensor_impl_.lock()); + return Tensor(weak_impl_.lock()); } bool is_same(const WeakTensor& other) const noexcept { - return weak_tensor_impl_ == other.weak_tensor_impl_; + return weak_impl_ == other.weak_impl_; } size_t use_count() const noexcept { - return weak_tensor_impl_.use_count(); + return weak_impl_.use_count(); } size_t weak_use_count() const noexcept { - return weak_tensor_impl_.weak_use_count(); + return weak_impl_.weak_use_count(); } TensorImpl* unsafeGetTensorImpl() const { - return weak_tensor_impl_._unsafe_get_target(); + return weak_impl_._unsafe_get_target(); } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_impl_; }; } // namespace at diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h index d2f98ff52780f..27232e2a3a8e9 100644 --- a/aten/src/ATen/core/TensorImpl.h +++ b/aten/src/ATen/core/TensorImpl.h @@ -16,11 +16,11 @@ namespace at { class Scalar; struct Type; struct Storage; -struct Tensor; +class Tensor; } // namespace at namespace at { -struct AT_API TensorImpl : public c10::intrusive_ptr_target { +struct CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); @@ -69,9 +69,11 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { // numbers. Otherwise, they behave like their non-wrapped equivalents. // See [Result type computation] in TensorIterator.h. 
bool is_wrapped_number() const { + AT_ASSERT(!is_variable()); return is_wrapped_number_; } void set_wrapped_number(bool value) { + AT_ASSERT(!is_variable()); AT_ASSERT(dim() == 0); is_wrapped_number_ = value; } @@ -97,10 +99,12 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { template inline T * data() const { + AT_ASSERT(!is_variable()); return storage_.data() + storage_offset_; } inline void* data() const { + AT_ASSERT(!is_variable()); return static_cast( static_cast(storage_.data()) + data_type_.itemsize() * storage_offset_); @@ -108,6 +112,7 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { template inline T * unsafe_data() const { + AT_ASSERT(!is_variable()); return storage_.unsafe_data() + storage_offset_; } @@ -155,6 +160,7 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { // sizes/strides are in bounds for the storage that is allocated; // this is the responsibility of the caller void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { + AT_ASSERT(!is_variable()); AT_CHECK( new_size.size() == new_stride.size(), "dimensionality of sizes (", @@ -174,12 +180,12 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { bool is_variable() const { return is_variable_; }; private: - int64_t storage_offset_; + int64_t storage_offset_ = 0; std::vector sizes_; std::vector strides_; - bool is_contiguous_; - int64_t numel_; + bool is_contiguous_ = true; + int64_t numel_ = -1; int64_t compute_numel() const { int64_t n = 1; @@ -192,9 +198,11 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { protected: void refresh_numel() { + AT_ASSERT(!is_variable()); numel_ = compute_numel(); } void refresh_contiguous() { + AT_ASSERT(!is_variable()); is_contiguous_ = compute_contiguous(); } TensorTypeId type_id_; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index 789340dc1b91d..c6197b4fc2d08 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1241,16 +1241,16 @@ inline Device Tensor::device() const { " but found ", \ at::toString(type().scalarType())); \ return static_cast(this->data_ptr()); \ - } \ - inline T* Tensor::to##name##Data() const { \ - return data(); \ } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) #undef DEFINE_CAST -#define DEFINE_TO_C_TYPE(T,name,_) \ -inline T Tensor::toC##name () const { return _local_scalar().to##name (); } +#define DEFINE_TO_C_TYPE(T, name, _) \ + template <> \ + inline T Tensor::item() const { \ + return _local_scalar().to##name(); \ + } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) #undef DEFINE_TO_C_TYPE diff --git a/aten/src/ATen/core/TensorOptions.h b/aten/src/ATen/core/TensorOptions.h index 2b589e9b13f48..4ae7b3452bddf 100644 --- a/aten/src/ATen/core/TensorOptions.h +++ b/aten/src/ATen/core/TensorOptions.h @@ -47,7 +47,7 @@ namespace at { /// at::zeros({2,2}, at::device({at::kCUDA, 1})); // place on device 1 /// at::zeros({2,2}, at::requires_grad()); /// -struct AT_API TensorOptions { +struct CAFFE2_API TensorOptions { TensorOptions() : TensorOptions(/*use_thread_local_default_options=*/true) {} /// Constructs the `TensorOptions` with defaults taken from the thread local diff --git a/aten/src/ATen/core/TensorTypeId.h b/aten/src/ATen/core/TensorTypeId.h index d01437bbe9197..ac584263c8018 100644 --- a/aten/src/ATen/core/TensorTypeId.h +++ b/aten/src/ATen/core/TensorTypeId.h @@ -17,7 +17,7 @@ using _tensorTypeId_underlyingType = uint8_t; * Dynamic type ID 
of a Tensor argument. It represents something like * CPUTensor, etc. */ -class AT_CORE_API TensorTypeId final +class CAFFE2_API TensorTypeId final : public at:: IdWrapper { public: @@ -32,10 +32,10 @@ class AT_CORE_API TensorTypeId final : IdWrapper(id) {} friend class TensorTypeIdCreator; - friend AT_CORE_API std::ostream& operator<<(std::ostream&, TensorTypeId); + friend CAFFE2_API std::ostream& operator<<(std::ostream&, TensorTypeId); }; -AT_CORE_API std::ostream& operator<<(std::ostream&, at::TensorTypeId); +CAFFE2_API std::ostream& operator<<(std::ostream&, at::TensorTypeId); } // namespace at diff --git a/aten/src/ATen/core/TensorTypeIdRegistration.h b/aten/src/ATen/core/TensorTypeIdRegistration.h index a7b30932cebe8..c252a6ef6e443 100644 --- a/aten/src/ATen/core/TensorTypeIdRegistration.h +++ b/aten/src/ATen/core/TensorTypeIdRegistration.h @@ -16,7 +16,7 @@ namespace at { -class AT_CORE_API TensorTypeIdCreator final { +class CAFFE2_API TensorTypeIdCreator final { public: TensorTypeIdCreator(); @@ -29,10 +29,10 @@ class AT_CORE_API TensorTypeIdCreator final { private: std::atomic last_id_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdCreator); }; -class AT_CORE_API TensorTypeIdRegistry final { +class CAFFE2_API TensorTypeIdRegistry final { public: TensorTypeIdRegistry(); @@ -43,10 +43,10 @@ class AT_CORE_API TensorTypeIdRegistry final { std::unordered_set registeredTypeIds_; std::mutex mutex_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistry); }; -class AT_CORE_API TensorTypeIds final { +class CAFFE2_API TensorTypeIds final { public: static TensorTypeIds& singleton(); @@ -61,14 +61,14 @@ class AT_CORE_API TensorTypeIds final { TensorTypeIdCreator creator_; TensorTypeIdRegistry registry_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIds); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIds); }; inline constexpr at::TensorTypeId TensorTypeIds::undefined() noexcept { return TensorTypeIdCreator::undefined(); } -class AT_CORE_API TensorTypeIdRegistrar final { +class CAFFE2_API TensorTypeIdRegistrar final { public: TensorTypeIdRegistrar(); ~TensorTypeIdRegistrar(); @@ -78,14 +78,15 @@ class AT_CORE_API TensorTypeIdRegistrar final { private: at::TensorTypeId id_; - AT_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar); + C10_DISABLE_COPY_AND_ASSIGN(TensorTypeIdRegistrar); }; inline at::TensorTypeId TensorTypeIdRegistrar::id() const noexcept { return id_; } -#define AT_DECLARE_TENSOR_TYPE(TensorName) AT_CORE_API at::TensorTypeId TensorName(); +#define AT_DECLARE_TENSOR_TYPE(TensorName) \ + CAFFE2_API at::TensorTypeId TensorName(); #define AT_DEFINE_TENSOR_TYPE(TensorName) \ at::TensorTypeId TensorName() { \ diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h index fdec4c6408d4c..3a2ccbe1e45ed 100644 --- a/aten/src/ATen/core/Type.h +++ b/aten/src/ATen/core/Type.h @@ -33,7 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; -struct Tensor; +class Tensor; static inline void noop_deleter(void*) {} @@ -76,7 +76,7 @@ enum class TypeID { NumOptions }; -struct AT_API Type { +struct CAFFE2_API Type { explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} @@ -364,8 +364,6 @@ struct AT_API Type { virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 
0; virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; - virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride) const = 0; - virtual Tensor tensor(IntList size, IntList stride) const = 0; virtual Tensor abs(const Tensor & self) const = 0; virtual Tensor & abs_(Tensor & self) const = 0; virtual Tensor acos(const Tensor & self) const = 0; @@ -579,15 +577,6 @@ struct AT_API Type { virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; - virtual Tensor tensor() const = 0; - virtual Tensor tensor(IntList size) const = 0; - virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; - virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; - virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; - virtual Tensor sparse_coo_tensor(IntList size) const = 0; - virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; - virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; - virtual Tensor _native_sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntList size) const = 0; virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; @@ -611,7 +600,6 @@ struct AT_API Type { TensorTypeId type_id_; bool is_variable_; bool is_undefined_; - }; } // namespace at diff --git a/aten/src/ATen/core/UndefinedTensorImpl.h b/aten/src/ATen/core/UndefinedTensorImpl.h index 6c734950d90ca..7a6866187c5f2 100644 --- a/aten/src/ATen/core/UndefinedTensorImpl.h +++ b/aten/src/ATen/core/UndefinedTensorImpl.h @@ -4,8 +4,8 @@ namespace at { -struct AT_API UndefinedTensorImpl final : public TensorImpl { -public: +struct CAFFE2_API UndefinedTensorImpl final : public TensorImpl { + public: // Without this, we get: // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code // (ostensibly because the constexpr tricks MSVC into trying to compile this diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h index a7c9d6119bfcd..daa6cdd373578 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -10,7 +10,7 @@ using DeleterFnPtr = void (*)(void*); namespace detail { // Does not delete anything -AT_CORE_API void deleteNothing(void*); +CAFFE2_API void deleteNothing(void*); // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but // with three major differences: diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index 09c972255ea6f..e8fd4da9e2753 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -20,8 +20,7 @@ namespace at { // // We may choose to absorb autograd into ATen, in which case this interface is obsolete. 
// -struct AT_API VariableHooksInterface { - +struct CAFFE2_API VariableHooksInterface { // This should never actually be implemented, but it is used to // squelch -Werror=non-virtual-dtor virtual ~VariableHooksInterface() {} @@ -34,18 +33,17 @@ struct AT_API VariableHooksInterface { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded } - }; // NB: dummy argument to suppress "ISO C++11 requires at least one argument // for the "..." in a variadic macro" -struct AT_API VariableHooksArgs {}; +struct CAFFE2_API VariableHooksArgs {}; AT_DECLARE_REGISTRY(VariableHooksRegistry, VariableHooksInterface, VariableHooksArgs) #define REGISTER_VARIABLE_HOOKS(clsname) AT_REGISTER_CLASS(VariableHooksRegistry, clsname, clsname) namespace detail { - AT_API const VariableHooksInterface& getVariableHooks(); +CAFFE2_API const VariableHooksInterface& getVariableHooks(); } } // namespace at diff --git a/aten/src/ATen/core/blob.cpp b/aten/src/ATen/core/blob.cpp new file mode 100644 index 0000000000000..930255194639b --- /dev/null +++ b/aten/src/ATen/core/blob.cpp @@ -0,0 +1 @@ +#include diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h new file mode 100644 index 0000000000000..17a09f33616e7 --- /dev/null +++ b/aten/src/ATen/core/blob.h @@ -0,0 +1,217 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace caffe2 { + +class Tensor; + +/** + * @brief Blob is a general container that hosts a typed pointer. + * + * A Blob hosts a pointer as well as its type, and takes charge of deleting it + * properly when the blob is deallocated or re-allocated with a new type. A blob + * could contain anything, although the most common case is to contain a Tensor. + */ +class CAFFE2_API Blob final : public c10::intrusive_ptr_target { + public: + using DestroyCall = void(void*); + + /** + * Initializes an empty Blob. + */ + Blob() noexcept : meta_(), pointer_(nullptr), destroy_(nullptr) {} + ~Blob() { + Reset(); + } + + Blob(Blob&& other) noexcept : Blob() { + swap(other); + } + + Blob& operator=(Blob&& other) noexcept { + Blob(std::move(other)).swap(*this); + return *this; + } + + /** + * Checks if the content stored in the blob is of type T. + */ + template + bool IsType() const noexcept { + return meta_.Match(); + } + + /** + * Returns the meta info of the blob. + */ + inline const TypeMeta& meta() const noexcept { + return meta_; + } + + /** + * Returns a printable typename of the blob. + */ + inline const char* TypeName() const noexcept { + return meta_.name(); + } + + /** + * @brief Gets the const reference of the stored object. The code checks if + * the stored object is of the desired type. + */ + // TODO(jerryzh): add a Get(DeviceType) function? + template + const T& Get() const { + AT_ASSERTM( + IsType(), + "wrong type for the Blob instance. Blob contains ", + meta_.name(), + " while caller expects ", + TypeMeta::TypeName()); + // TODO: after we add Get(DeviceType) + // and changed all the callsites, we can add + // a static assert here to enforce T != Tensor + return *static_cast(pointer_); + } + + const void* GetRaw() const noexcept { + return pointer_; + } + void* GetRaw() noexcept { + return pointer_; + } + + /** + * @brief Gets a mutable pointer to the stored object. + * + * If the current object is not of the right type, a new object is created + * and the old object is freed. Note that type T should have a default + * constructor. Otherwise, create the object yourself first, and use + * Reset(). 
+ */ + template + T* GetMutable() { + static_assert( + std::is_default_constructible::value, + "GetMutable can't be called with non-default-constructible types. " + "Try using specialized methods"); + if (IsType()) { + return static_cast(pointer_); + } else { + // TODO Re-enable logging + // VLOG(1) << "Create new mutable object " << TypeMeta::TypeName(); + return Reset(new T()); + } + } + + template + T* GetMutableOrNull() { + if (IsType()) { + return static_cast(pointer_); + } else { + return nullptr; + } + } + + /** + * Sets the underlying object to the allocated one. The Blob then takes over + * the ownership of the passed in pointer. If there is already an object in + * the Blob, the old object is freed. + * + * This is used when the underlying class T does not have a default ctor, or + * complex initializations needs to be done outside the blob. + */ + template + T* Reset(T* allocated) { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + meta_ = TypeMeta::Make(); + pointer_ = static_cast(allocated); + destroy_ = &Destroy; + return allocated; + } + + /** + * Sets the underlying object to the allocated one, but does not take over + * the ownership of the passed in pointer. If there is already an object in + * the Blob, the old object is freed. + * + * Unlike Reset, this does not take over the ownership of the pointer and the + * caller is responsible for making sure that the lifetime of the allocated + * blob outlasts the lifetime of any access to this blob, until another Reset + * call is made or the blob is destructed. + */ + template + typename std::remove_const::type* ShareExternal( + typename std::remove_const::type* allocated) { + return static_cast(ShareExternal( + static_cast(allocated), + TypeMeta::Make::type>())); + } + + void* ShareExternal(void* allocated, const TypeMeta& meta) { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + meta_ = meta; + pointer_ = static_cast(allocated); + destroy_ = nullptr; + return allocated; + } + + /** + * Resets the Blob to an empty one. + */ + inline void Reset() { + if (pointer_ && destroy_) { + destroy_(pointer_); + } + pointer_ = nullptr; + meta_ = TypeMeta(); + destroy_ = nullptr; + } + + /** + * @brief Swaps the underlying storage of two blobs. + */ + void swap(Blob& rhs) { + using std::swap; + swap(meta_, rhs.meta_); + swap(pointer_, rhs.pointer_); + swap(destroy_, rhs.destroy_); + } + + private: + /** + * @brief A destroy call that is used to properly deconstruct objects. + */ + template + static void Destroy(void* pointer) { + delete static_cast(pointer); + } + TypeMeta meta_; + void* pointer_ = nullptr; + DestroyCall* destroy_ = nullptr; + + C10_DISABLE_COPY_AND_ASSIGN(Blob); +}; + +inline void swap(Blob& lhs, Blob& rhs) { + lhs.swap(rhs); +} + +inline std::ostream& operator<<(std::ostream& out, const Blob& v) { + return out << "Blob[" << v.TypeName() << "]"; +} + +} // namespace caffe2 diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 0a653ba0a1237..326cae5eb9691 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -25,7 +25,7 @@ class BaseContext; functions that are invoked statically before in Tensor class, e.g. New, We will merge this with Allocator later. */ -class AT_CORE_API BaseStaticContext { +class CAFFE2_API BaseStaticContext { public: virtual ~BaseStaticContext() noexcept {} @@ -55,7 +55,7 @@ class AT_CORE_API BaseStaticContext { * functions in the BaseContext class. * TODO: add docs after this is finalized. 
*/ -class AT_CORE_API BaseContext { +class CAFFE2_API BaseContext { public: virtual ~BaseContext() noexcept {} @@ -192,9 +192,9 @@ using at::BaseContext; using at::BaseStaticContext; using StaticContextMap = std::unordered_map; -AT_API StaticContextMap& GetStaticContexts(); -AT_API void set_static_context(at::DeviceType t, BaseStaticContext* ptr); -AT_API BaseStaticContext* get_static_context(at::DeviceType t); +CAFFE2_API StaticContextMap& GetStaticContexts(); +CAFFE2_API void set_static_context(at::DeviceType t, BaseStaticContext* ptr); +CAFFE2_API BaseStaticContext* get_static_context(at::DeviceType t); template struct StaticContextFunctionRegisterer { diff --git a/aten/src/ATen/core/intrusive_ptr.h b/aten/src/ATen/core/intrusive_ptr.h index 961915555a375..4dc3c501e9433 100644 --- a/aten/src/ATen/core/intrusive_ptr.h +++ b/aten/src/ATen/core/intrusive_ptr.h @@ -33,7 +33,7 @@ namespace c10 { // tells us if the object was allocated by us. If it wasn't, no // intrusive_ptr for you! -class AT_CORE_API intrusive_ptr_target { +class CAFFE2_API intrusive_ptr_target { // Note [Weak references for intrusive refcounting] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Here's the scheme: @@ -114,7 +114,7 @@ class AT_CORE_API intrusive_ptr_target { namespace detail { template -struct AT_CORE_EXPORT intrusive_target_default_null_type final { +struct C10_EXPORT intrusive_target_default_null_type final { static constexpr TTarget* singleton() noexcept { return nullptr; } @@ -136,7 +136,7 @@ class weak_intrusive_ptr; template < class TTarget, class NullType = detail::intrusive_target_default_null_type> -class AT_CORE_EXPORT intrusive_ptr final { +class C10_EXPORT intrusive_ptr final { private: static_assert( std::is_base_of::value, @@ -391,7 +391,7 @@ inline bool operator!=( template < typename TTarget, class NullType = detail::intrusive_target_default_null_type> -class AT_CORE_EXPORT weak_intrusive_ptr final { +class C10_EXPORT weak_intrusive_ptr final { private: static_assert( std::is_base_of::value, @@ -739,13 +739,13 @@ namespace std { // To allow intrusive_ptr and weak_intrusive_ptr inside std::unordered_map or // std::unordered_set, we need std::hash template -struct AT_CORE_EXPORT hash> { +struct C10_EXPORT hash> { size_t operator()(const c10::intrusive_ptr& x) const { return std::hash()(x.get()); } }; template -struct AT_CORE_EXPORT hash> { +struct C10_EXPORT hash> { size_t operator()(const c10::weak_intrusive_ptr& x) const { return std::hash()(x._unsafe_get_target()); } diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 3d2b56893e718..8dfb1e8ebb75b 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,12 +1,15 @@ #include #include -#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ + _(TensorList) _(Blob) namespace torch { namespace jit { -AT_API c10::intrusive_ptr ConstantString::create(std::string str_) { +CAFFE2_API c10::intrusive_ptr ConstantString::create( + std::string str_) { return c10::make_intrusive(std::move(str_)); } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 914598f6ceb42..513845d4c12af 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -14,7 +15,7 @@ template using Shared = c10::intrusive_ptr; 
// string -struct AT_API ConstantString final : c10::intrusive_ptr_target { +struct CAFFE2_API ConstantString final : c10::intrusive_ptr_target { private: const std::string str_; public: @@ -27,14 +28,14 @@ struct AT_API ConstantString final : c10::intrusive_ptr_target { operator const std::string & () const { return string(); } - AT_API friend std::ostream& operator<<( + CAFFE2_API friend std::ostream& operator<<( std::ostream& out, const ConstantString& v); }; // non-mutable list template -struct AT_CORE_EXPORT ConstantList final : c10::intrusive_ptr_target { +struct C10_EXPORT ConstantList final : c10::intrusive_ptr_target { private: const std::vector elements_; public: @@ -64,10 +65,12 @@ using DoubleList = ConstantList; // to mark whether that type is a subtype of c10::intrusive_ptr_target and needs // retain/release calls. -#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) +#define TORCH_FORALL_TAGS(_) \ + _(None) \ + _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) \ + _(TensorList) _(Blob) -struct AT_API IValue final { +struct CAFFE2_API IValue final { IValue() : payload{0} , tag(Tag::None) @@ -125,6 +128,25 @@ struct AT_API IValue final { return at::Tensor(toIntrusivePtr()); } + IValue(caffe2::Blob blob) : tag(Tag::Blob), is_intrusive_ptr(true) { + // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract + // and + // store it as a Tensor instead. + payload.as_intrusive_ptr = + c10::make_intrusive(std::move(blob)).release(); + } + bool isBlob() const { + return Tag::Blob == tag; + } + caffe2::Blob& toBlob() & { + AT_ASSERT(isBlob()); + return *static_cast(payload.as_intrusive_ptr); + } + const caffe2::Blob& toBlob() const& { + AT_ASSERT(isBlob()); + return *static_cast(payload.as_intrusive_ptr); + } + // Tuple IValue(c10::intrusive_ptr v); bool isTuple() const { return Tag::Tuple == tag; } @@ -277,7 +299,9 @@ struct AT_API IValue final { template T to() const &; - AT_API friend std::ostream& operator<<(std::ostream& out, const IValue& v); + CAFFE2_API friend std::ostream& operator<<( + std::ostream& out, + const IValue& v); private: // NOTE: IValue tags are intentionally private. In the future we may encode diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index 2ed81cb1e1c8a..9055746ea377d 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -47,7 +47,8 @@ class TypeMeta; * use TypeIdentifier with custom types. This is for example used to store the * dtype of tensors. */ -class AT_CORE_API TypeIdentifier final : public at::IdWrapper { +class CAFFE2_API TypeIdentifier final + : public at::IdWrapper { public: static TypeIdentifier createTypeId(); @@ -61,6 +62,8 @@ class AT_CORE_API TypeIdentifier final : public at::IdWrapper& gTypeNames(); -AT_CORE_API std::unordered_set& gRegisteredTypeNames(); +CAFFE2_API std::unordered_map& gTypeNames(); +CAFFE2_API std::unordered_set& gRegisteredTypeNames(); +inline const char* TypeIdentifier::name() const noexcept { + auto it = gTypeNames().find(*this); + assert(it != gTypeNames().end()); + return it->second.c_str(); +} -AT_CORE_API std::mutex& gTypeRegistrationMutex(); +CAFFE2_API std::mutex& gTypeRegistrationMutex(); template struct TypeNameRegisterer { @@ -139,7 +147,7 @@ struct TypeNameRegisterer { * stores some additional data such as the item size and the name of the type * for run-time inspection. 
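The new Blob tag added to IValue follows the same check-then-extract pattern as the existing tags: isBlob() tests the tag, toBlob() asserts it before handing back the payload. A rough usage sketch (the consume function is an illustrative assumption, not code from this patch; the Blob definition is assumed to be visible through the include added to ivalue.h above):

#include <ATen/core/ivalue.h>

void consume(torch::jit::IValue v) {
  if (v.isBlob()) {
    caffe2::Blob& blob = v.toBlob();  // AT_ASSERTs the Blob tag before returning
    (void)blob;                       // hypothetical downstream use of the blob
  }
}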
*/ -class AT_CORE_API TypeMeta { +class CAFFE2_API TypeMeta { public: using PlacementNew = void(void*, size_t); using TypedCopy = void(const void*, void*, size_t); @@ -240,7 +248,7 @@ class AT_CORE_API TypeMeta { * is generated during run-time. Do NOT serialize the id for storage. */ template - AT_CORE_API static TypeIdentifier Id(); + CAFFE2_API static TypeIdentifier Id(); /** * Returns the item size of the type. This is equivalent to sizeof(T). @@ -396,20 +404,16 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * * NOTE: the macro needs to be invoked in ::caffe2 namespace */ -// Implementation note: in MSVC, we will need to prepend the AT_CORE_API +// Implementation note: in MSVC, we will need to prepend the CAFFE2_API // keyword in order to get things compiled properly. in Linux, gcc seems to // create attribute ignored error for explicit template instantiations, see // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51930 // and as a result, we define these two macros slightly differently. -// TODO(jiayq): AT_CORE_API below is not correct, because we may use the -// definition in third party dependent libraries. The proper way is to use -// CAFFE2_EXPORT (which explicitly requires dllexport). Marking this as a -// todo item when the unified build is finished. #ifdef _MSC_VER #define CAFFE_KNOWN_TYPE(T) \ template <> \ - AT_CORE_EXPORT TypeIdentifier TypeMeta::Id() { \ + C10_EXPORT TypeIdentifier TypeMeta::Id() { \ static const TypeIdentifier type_id = TypeIdentifier::createTypeId(); \ static TypeNameRegisterer registerer(type_id, #T); \ return type_id; \ @@ -431,10 +435,10 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { * for your own types to allocate dynamic ids for them. */ #ifdef _MSC_VER -#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ - template <> \ - inline AT_CORE_API TypeIdentifier TypeMeta::Id() { \ - return TypeIdentifier(PreallocatedId); \ +#define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ + template <> \ + inline CAFFE2_API TypeIdentifier TypeMeta::Id() { \ + return TypeIdentifier(PreallocatedId); \ } #else // _MSC_VER #define CAFFE_DECLARE_KNOWN_TYPE(PreallocatedId, T) \ diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp index e36178d47439a..0a4649d9c41ad 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -58,6 +58,10 @@ cusparseHandle_t getCurrentCUDASparseHandle() { return THCState_getCurrentSparseHandle(at::globalContext().getTHCState()); } +cublasHandle_t getCurrentCUDABlasHandle() { + return THCState_getCurrentBlasHandle(at::globalContext().getTHCState()); +} + } // namespace cuda } // namespace at diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h index 279ac1d9b1e05..3a480d2ca4e4e 100644 --- a/aten/src/ATen/cuda/CUDAContext.h +++ b/aten/src/ATen/cuda/CUDAContext.h @@ -9,6 +9,7 @@ #include "cuda_runtime_api.h" #include "cusparse.h" +#include "cublas_v2.h" namespace at { namespace cuda { @@ -35,31 +36,31 @@ manage their own state. There is only a single CUDA context/state. 
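For reference, the registration macros above are what downstream code uses to teach TypeMeta about a new C++ type; the only change this patch makes is which export macro they expand to on MSVC. A sketch under the usual constraints (MyPayload is a hypothetical type; as the note above says, the macro must be invoked inside the ::caffe2 namespace):

#include <ATen/core/typeid.h>

struct MyPayload { int x; };

namespace caffe2 {
// After this, TypeMeta::Id<MyPayload>() returns a process-wide TypeIdentifier
// and the demangled type name is recorded in gTypeNames().
CAFFE_KNOWN_TYPE(MyPayload);
}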
*/ /* Device info */ -AT_API int64_t getNumGPUs(); +CAFFE2_API int64_t getNumGPUs(); -AT_API int64_t current_device(); +CAFFE2_API int64_t current_device(); -AT_API void set_device(int64_t device); +CAFFE2_API void set_device(int64_t device); -AT_API cudaDeviceProp* getCurrentDeviceProperties(); +CAFFE2_API cudaDeviceProp* getCurrentDeviceProperties(); -AT_API cudaDeviceProp* getDeviceProperties(int64_t device); +CAFFE2_API cudaDeviceProp* getDeviceProperties(int64_t device); /* Streams */ -AT_API CUDAStream createCUDAStream( - const bool isHighPriority = false -, int64_t device = -1); +CAFFE2_API CUDAStream +createCUDAStream(const bool isHighPriority = false, int64_t device = -1); -AT_API CUDAStream getDefaultCUDAStream(int64_t device = -1); -AT_API CUDAStream getCurrentCUDAStream(int64_t device = -1); +CAFFE2_API CUDAStream getDefaultCUDAStream(int64_t device = -1); +CAFFE2_API CUDAStream getCurrentCUDAStream(int64_t device = -1); -AT_API void setCurrentCUDAStream(CUDAStream stream); -AT_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); +CAFFE2_API void setCurrentCUDAStream(CUDAStream stream); +CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream); -AT_API Allocator* getCUDADeviceAllocator(); +CAFFE2_API Allocator* getCUDADeviceAllocator(); /* Handles */ -AT_API cusparseHandle_t getCurrentCUDASparseHandle(); +CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle(); +CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle(); } // namespace cuda diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 4e60ee1597cc4..69149932ac7b9 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -47,7 +47,7 @@ constexpr const char* CUDA_HELP = // TODO: Consider putting the stub definitions in another class, so that one // never forgets to implement each virtual function in the real implementation // in CUDAHooks. This probably doesn't buy us much though. -struct AT_API CUDAHooksInterface { +struct CAFFE2_API CUDAHooksInterface { // This should never actually be implemented, but it is used to // squelch -Werror=non-virtual-dtor virtual ~CUDAHooksInterface() {} @@ -129,14 +129,14 @@ struct AT_API CUDAHooksInterface { // NB: dummy argument to suppress "ISO C++11 requires at least one argument // for the "..." in a variadic macro" -struct AT_API CUDAHooksArgs {}; +struct CAFFE2_API CUDAHooksArgs {}; AT_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs) #define REGISTER_CUDA_HOOKS(clsname) \ AT_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname) namespace detail { -AT_API const CUDAHooksInterface& getCUDAHooks(); +CAFFE2_API const CUDAHooksInterface& getCUDAHooks(); /// This class exists to let us access `cudaSetDevice`, `cudaGetDevice` and CUDA /// error handling functions, when CUDA is available. These functions will first @@ -144,7 +144,7 @@ AT_API const CUDAHooksInterface& getCUDAHooks(); /// the `cudaSetDevice`/`cudaGetDevice` functions. This allows us to access them /// with only a single pointer indirection, while virtual dispatch would require /// two (one for the virtual call, one for `cudaSetDevice`/`cudaGetDevice`). 
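getCurrentCUDABlasHandle() is the one genuinely new entry point in this header (the rest is the AT_API to CAFFE2_API rename); it exposes the THC-owned cuBLAS handle next to the existing cuSPARSE accessor. A small usage sketch, assuming a CUDA build with the usual ATen include paths (query_current_device is an illustrative name):

#include <ATen/cuda/CUDAContext.h>

void query_current_device() {
  cudaDeviceProp* prop     = at::cuda::getCurrentDeviceProperties();
  cublasHandle_t blas      = at::cuda::getCurrentCUDABlasHandle();    // new in this patch
  cusparseHandle_t sparse  = at::cuda::getCurrentCUDASparseHandle();
  (void)prop; (void)blas; (void)sparse;  // e.g. fed into magma_queue_create_from_cuda, see Gesv.cu below
}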
-struct AT_API DynamicCUDAInterface { +struct CAFFE2_API DynamicCUDAInterface { static void (*set_device)(int32_t); static void (*get_device)(int32_t*); static void (*unchecked_set_device)(int32_t); diff --git a/aten/src/ATen/detail/ComplexHooksInterface.h b/aten/src/ATen/detail/ComplexHooksInterface.h index 80ecfb6f26f83..e5d5c3ec2a83f 100644 --- a/aten/src/ATen/detail/ComplexHooksInterface.h +++ b/aten/src/ATen/detail/ComplexHooksInterface.h @@ -7,7 +7,7 @@ namespace at { class Context; -struct AT_API ComplexHooksInterface { +struct CAFFE2_API ComplexHooksInterface { virtual ~ComplexHooksInterface() {} virtual void registerComplexTypes(Context*) const { @@ -15,13 +15,13 @@ struct AT_API ComplexHooksInterface { } }; -struct AT_API ComplexHooksArgs {}; +struct CAFFE2_API ComplexHooksArgs {}; AT_DECLARE_REGISTRY(ComplexHooksRegistry, ComplexHooksInterface, ComplexHooksArgs) #define REGISTER_COMPLEX_HOOKS(clsname) \ AT_REGISTER_CLASS(ComplexHooksRegistry, clsname, clsname) namespace detail { -AT_API const ComplexHooksInterface& getComplexHooks(); +CAFFE2_API const ComplexHooksInterface& getComplexHooks(); } } diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 323701d69e837..189cadf0b6d1c 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -154,7 +154,7 @@ def TypedDict(name, attrs, total=True): # type: ignore """) # add a native declaration for a native function NATIVE_DECLARATION = CodeTemplate("""\ -AT_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); +CAFFE2_API ${return_type} ${native_type_method_dispatch}(${formals_with_defaults}); """) # special method definition for factory functions in Functions.h diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 30ccc616c6e7c..84f83946094c8 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -37,11 +37,11 @@ Tensor & celu_(Tensor & self, Scalar alpha) { } Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { - return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); + return at::rrelu_with_noise(self, at::empty({0}, self.options()), lower, upper, training, generator); } Tensor & rrelu_(Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { - return at::rrelu_with_noise_(self, self.type().tensor(), lower, upper, training, generator); + return at::rrelu_with_noise_(self, at::empty({0}, self.options()), lower, upper, training, generator); } // ----------------------------------- diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 517d00164e37f..5ddb36bb5b4dd 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -16,7 +16,7 @@ DEFINE_DISPATCH(div_stub); Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { if (other.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } if (self.is_sparse()) { at::_sparse_add_out(result, self, other, alpha); @@ -44,7 +44,7 @@ Tensor& add_(Tensor& self, const Tensor& other, Scalar alpha) { Tensor& div_out(Tensor& result, const Tensor& self, const Tensor& other) { if (self.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } if (other.dim() != 0) { AT_ERROR("div(): sparse division only supports 
division by a scalar ", @@ -69,7 +69,7 @@ Tensor& div_(Tensor& self, const Tensor& other) { Tensor& mul_out(Tensor& result, const Tensor& self, const Tensor& other) { if (self.is_sparse() || other.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } return at::_sparse_mul_out(result, self, other); } @@ -90,7 +90,7 @@ Tensor& mul_(Tensor& self, const Tensor& other) { Tensor& sub_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { if (other.is_sparse()) { if (!result.defined()) { - result = self.type().tensor(); + result = at::empty({0}, self.options()); } if (!self.sizes().equals(other.sizes())) { AT_ERROR("sizes do not match"); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index cee0ccd212f5c..77bc209c7036d 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -318,7 +318,7 @@ at::Tensor _convolution( weight = view4d(weight); } - auto output = input.type().tensor(); + auto output = at::empty({0}, input.options()); if (params.is_depthwise(input, weight)) { /* output.resize_(output_size(input, weight)); */ diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index 0c2ac96dce806..8b9779313bf89 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -33,11 +33,11 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in "the weight tensor (output channels)."); // input * weights + bias -> output_features - Tensor output = self.type().tensor({ + Tensor output = at::empty({ olen, input_size[1], weight_size[2], - }); + }, self.options()); output.copy_(bias.expand(output.sizes())); for (int k = 0; k < kw; k++) { int iShift = std::max(0, static_cast(k - real_pad)); diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 42ef6a4f6bb5f..c803ecd3f353b 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -49,10 +49,10 @@ enum class CPUCapability { CPUCapability get_cpu_capability(); template -struct AT_API DispatchStub; +struct CAFFE2_API DispatchStub; template -struct AT_API DispatchStub { +struct CAFFE2_API DispatchStub { using FnPtr = rT (*) (Args...); template @@ -114,9 +114,9 @@ struct RegisterDispatch { // adding parentheses and using helper struct to get rid of the parentheses, do // not work with MSVC. So do a `using`-declaration if you need to pass in such // `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. 
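The bulk of the native/*.cpp churn in this patch is mechanical: self.type().tensor(...) becomes an at:: factory call taking the sizes plus self.options(), so the result inherits dtype, device, and layout without going through Type. A sketch of the correspondence (make_like and make_index_buffer are hypothetical helpers, shown only to pair the old and new spellings):

#include <ATen/ATen.h>

// was: Tensor out = self.type().tensor(self.sizes());
at::Tensor make_like(const at::Tensor& self) {
  return at::empty(self.sizes(), self.options());
}

// was: Tensor idx = self.type().toScalarType(kLong).tensor();
at::Tensor make_index_buffer(const at::Tensor& self) {
  return at::empty({0}, self.options().dtype(at::kLong));
}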
-#define DECLARE_DISPATCH(fn, name) \ +#define DECLARE_DISPATCH(fn, name) \ struct name : DispatchStub {}; \ - extern AT_API struct name name + extern CAFFE2_API struct name name #define DEFINE_DISPATCH(name) struct name name diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 08f306869d89f..f075269291d64 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -26,7 +26,7 @@ Tensor _pdist_forward(const Tensor& self, const double p) { AT_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); auto device = self.type().device_type(); AT_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); if (self.size(0) <= 1) { result.resize_({0}); } else { diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 3a2d1da5bd9a5..9810c9128980e 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -173,7 +173,7 @@ Tensor& bernoulli_scalar_cpu_(Tensor& self, double p, Generator* gen) { Tensor _standard_gamma_grad_cpu(const Tensor& self, const Tensor& output) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_FLOATING_TYPES(self.type(), "_standard_gamma_grad", [&] { CPU_tensor_apply3(ret, self, output, [](scalar_t& ret_val, const scalar_t& self_val, const scalar_t &output_val) { diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 99fa4c701d4bb..72518fbd4a0e8 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -66,12 +66,12 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; - auto& dense_type = grad.type(); + auto dense_options = grad.options(); // check if all our grad come from padding_idx if (grad.numel() == 0) { - return at::_sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), - dense_type.tensor({0, num_features}), + return at::_sparse_coo_tensor_unsafe(at::empty({1, 0}, indices_.options()), + at::empty({0, num_features}, dense_options), weight_size); } @@ -168,7 +168,7 @@ Tensor & embedding_renorm_cpu_( continue; } auto row = self[sorted_indices[i]]; - auto norm = row.norm(norm_type).toCDouble(); + auto norm = row.norm(norm_type).item(); if (norm > max_norm) { auto scale = max_norm / (norm + 1e-7); row *= scale; diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 5566fd397320a..bb06719d85a49 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -73,7 +73,7 @@ static std::vector expandByteTensors(const Tensor & self, TensorList ind if (special_empty) { // We can't call select on an empty tensor so we just create an empty // tensor. 
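The embedding and indexing hunks above also fold in a second mechanical rename: scalar extraction moves from the toC*() family to the templated item<T>() accessor on 0-dim tensors. A sketch of the correspondence (tensor contents here are illustrative only):

#include <ATen/ATen.h>

void scalar_extraction_examples() {
  at::Tensor t = at::ones({3});                              // float CPU tensor
  double total = t.sum().item<double>();                     // was: t.sum().toCDouble()
  at::Tensor n = at::ones({4}, at::TensorOptions().dtype(at::kLong));
  int64_t count = n.sum().item<int64_t>();                   // was: n.sum().toCLong()
  bool all_pos = (t > 0).all().item<uint8_t>() != 0;         // was: (t > 0).all().toCByte()
  (void)total; (void)count; (void)all_pos;
}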
- result.emplace_back(nonzero.type().tensor()); + result.emplace_back(at::empty({0}, nonzero.options())); } else { result.emplace_back(nonzero.select(1, j)); } @@ -143,8 +143,8 @@ static Tensor unsqueezeN(const Tensor & src, int64_t before, int64_t after) { static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) { if (index.numel() != 0) { - auto max_idx = index.max().toCLong(); - auto min_idx = index.min().toCLong(); + auto max_idx = index.max().item(); + auto min_idx = index.min().item(); AT_CHECK(max_idx < dim_size, "index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); AT_CHECK(min_idx >= -dim_size, diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 0aaf2149b42a0..5fc554410ac9c 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -150,10 +150,6 @@ Tensor tensor(const Type& dtype, ArrayRef size) { } } -Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return at::getType(dtype.options().layout(at::kSparse)).native_sparse_coo_tensor(size); -} - Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); } @@ -162,6 +158,21 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size, const TensorOptions& options) { + TensorOptions toptions = options; + return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(size); +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, const TensorOptions& options) { + TensorOptions toptions = options; + return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(indices, values); +} + +Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, ArrayRef size, const TensorOptions& options) { + TensorOptions toptions = options; + return at::getType(toptions.layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); +} + Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index bdf9602fe9ae0..7b0d89d4d5675 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -404,7 +404,7 @@ Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, int64_t slicemul2 = (expand2[unroll_dim] ? 0 : 1); int64_t slicemul3 = (expand3[unroll_dim] ? 0 : 1); - auto output = i1.type().tensor(output_size).zero_(); + auto output = at::zeros(output_size, i1.options()); if (! 
sumdim[unroll_dim]) { for (int64_t k = 0; k < unroll_size; k++) { Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1), diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 04bf617081387..0cd08c5b2c491 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -22,7 +22,7 @@ static inline std::tuple _lu_det_P_diag_U_info(const Tensor std::tie(lu, p, info) = self.unsqueeze(0).btrifact_with_info(); p.squeeze_(0); lu.squeeze_(0); - int int_info = info.squeeze_().toCInt(); + int int_info = info.squeeze_().item(); AT_CHECK(int_info >= 0, "LU factorization (getrf) failed with info = ", int_info); auto n = self.size(0); auto num_exchanges = (at::arange(1, n + 1, p.type()) != p).nonzero().size(0); @@ -63,7 +63,7 @@ Tensor logdet(const Tensor& self) { } else { det = diag_U.prod().mul_(det_P); } - if (det.sign().toCDouble() <= 0) { + if (det.sign().item() <= 0) { return det.log_(); // in order to get proper -inf (det=0) or nan (det<0) } else { return diag_U.abs().log().sum(); @@ -88,7 +88,7 @@ std::tuple slogdet(const Tensor& self) { } Tensor inverse(const Tensor& self) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::inverse_out(result, self); } @@ -111,7 +111,7 @@ Tensor pinverse(const Tensor& self, double rcond) { "of floating types"); if (self.numel() == 0) { // Match NumPy - return self.type().tensor({self.size(1), self.size(0)}); + return at::empty({self.size(1), self.size(0)}, self.options()); } Tensor U, S, V; std::tie(U, S, V) = self.svd(); @@ -345,7 +345,7 @@ static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& Tensor baddbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::baddbmm_out_cpu(result, self, batch1, batch2, beta, alpha); } @@ -362,7 +362,7 @@ Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, S } Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::bmm_out_cpu(result, self, mat2); } diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index c976121e77ae3..9e61db8543fbc 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -321,7 +321,7 @@ Tensor sum(const Tensor& self, IntList dim, ScalarType dtype) { Tensor _sum(const Tensor &self, int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::_sum_out(result, self, dim, keepdim); } @@ -343,7 +343,7 @@ Tensor prod(const Tensor& self, int64_t dim, ScalarType dtype) { Tensor _prod(const Tensor &self, int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::_prod_out(result, self, dim, keepdim); } @@ -365,7 +365,7 @@ Tensor& logsumexp_out(Tensor& result, const Tensor &self, int64_t dim_, bool kee Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return 
at::native::logsumexp_out(result, self, dim, keepdim); } @@ -639,7 +639,7 @@ Tensor _norm(const Tensor &self, Scalar p) { } Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::norm_out(result, self, p, dim, keepdim); } @@ -648,7 +648,7 @@ Tensor norm(const Tensor& self, Scalar p) { } Tensor all(const Tensor& self, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::all_out(result, self, dim, keepdim); } @@ -665,7 +665,7 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { } Tensor any(const Tensor& self, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::any_out(result, self, dim, keepdim); } @@ -690,7 +690,7 @@ Tensor var(const Tensor& self, bool unbiased) { } Tensor var(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::var_out(result, self, dim, unbiased, keepdim); } @@ -715,7 +715,7 @@ Tensor std(const Tensor& self, bool unbiased) { } Tensor std(const Tensor& self, int64_t dim, bool unbiased, bool keepdim) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return at::native::std_out(result, self, dim, unbiased, keepdim); } diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp index 1a089a9f473c1..26aeee9caf719 100644 --- a/aten/src/ATen/native/RoiPooling.cpp +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -28,13 +28,13 @@ std::tuple RoiPooling2d_forward_cpu( auto inputWidth = input.size(3); // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) - auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto output = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options()); // TODO: need some mechanism for determining train vs. 
test // During training, we need to store the argmaxes for the pooling operation, so // the argmaxes Tensor should be the same size as the output Tensor - auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto argmaxes = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options().dtype(kInt)); AT_CHECK(input.is_contiguous(), "input must be contiguous"); AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 40c4ce39addeb..1cca4191fd079 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -34,7 +34,7 @@ DEFINE_DISPATCH(max_kernel); DEFINE_DISPATCH(min_kernel); bool allclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { - return at::isclose(self, other, rtol, atol, equal_nan).all().toCByte(); + return at::isclose(self, other, rtol, atol, equal_nan).all().item(); } Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol, bool equal_nan) { @@ -85,7 +85,7 @@ Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { } Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& other) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_ALL_TYPES(ret.type(), "where", [&] { where_cpu(ret, condition, self, other); }); @@ -93,8 +93,8 @@ Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& o } std::tuple kthvalue(const Tensor& self, int64_t k, int64_t dim, bool keepdim) { - Tensor values = self.type().tensor(); - Tensor indices = self.type().toScalarType(kLong).tensor(); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); return at::native::kthvalue_out(values, indices, self, k, dim, keepdim); } @@ -113,8 +113,8 @@ std::tuple kthvalue_out(Tensor& values, Tensor& indices, } std::tuple median(const Tensor& self, int64_t dim, bool keepdim) { - Tensor values = self.type().tensor(); - Tensor indices = self.type().toScalarType(kLong).tensor(); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); return at::native::median_out(values, indices, self, dim, keepdim); } @@ -133,8 +133,8 @@ std::tuple median_out(Tensor& values, Tensor& indices, } std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { - Tensor values = self.type().tensor(); - Tensor indices = self.type().toScalarType(kLong).tensor(); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); return at::native::mode_out(values, indices, self, dim, keepdim); } @@ -168,8 +168,8 @@ std::tuple _max_out_cpu(Tensor& max, Tensor& max_indices, } std::tuple max(const Tensor& self, int64_t dim, bool keepdim) { - Tensor max = self.type().tensor(); - Tensor max_indices = self.type().toScalarType(kLong).tensor(); + Tensor max = at::empty({0}, self.options()); + Tensor max_indices = at::empty({0}, self.options().dtype(kLong)); return at::native::max_out(max, max_indices, self, dim, keepdim); } @@ -211,8 +211,8 @@ std::tuple _min_out_cpu(Tensor& min, Tensor& min_indices, } std::tuple min(const Tensor& self, int64_t dim, bool keepdim) { - Tensor min = self.type().tensor(); - Tensor min_indices = self.type().toScalarType(kLong).tensor(); + Tensor min = 
at::empty({0}, self.options()); + Tensor min_indices = at::empty({0}, self.options().dtype(kLong)); return at::native::min_out(min, min_indices, self, dim, keepdim); } diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 178045d9fd0de..2e37acc951a61 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -118,6 +118,12 @@ Tensor& empty_out(Tensor& result, IntList size) { return result; } +Tensor empty_strided(IntList size, IntList stride, const TensorOptions& options) { + // Note [Native bindings for legacy TH factory functions] + return getFactoryType(options).tensor(size, stride); +} + + // Temporary type cast operators. These are needed to trace type-casts now since // Type's are not supported in the IR. Instead, we call down to these // specialized operators for each datatype. diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index c3535a92a0572..97645c0d0256c 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -153,7 +153,7 @@ void TensorIterator::allocate_outputs() { for (int dim = 0; dim < ndim(); dim++) { tensor_stride[dim] /= element_size; } - *op.tensor = op.type->tensor(tensor_shape, tensor_stride); + *op.tensor = at::empty_strided(tensor_shape, tensor_stride, op.type->options()); } } } diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 3faedbec6bb32..7d97d7f7f6635 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -50,7 +50,7 @@ namespace at { -struct AT_API OperandInfo { +struct CAFFE2_API OperandInfo { OperandInfo() {} OperandInfo(const Tensor& t) : tensor(const_cast(&t)) {} @@ -82,7 +82,7 @@ struct AT_API OperandInfo { struct SplitUntil32Bit; -struct AT_API TensorIterator { +struct CAFFE2_API TensorIterator { struct Builder; friend struct Builder; @@ -212,8 +212,8 @@ struct TensorIterator::Builder { /// A container-like struct that acts as if it contains splits of a /// TensorIterator that can use 32-bit indexing. Taken together the splits cover /// the original TensorIterator. 
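at::empty_strided is the one new factory introduced here (TensorFactories.cpp above); TensorIterator::allocate_outputs switches to it so outputs can be allocated with an explicit stride order instead of going through op.type->tensor(shape, stride). A standalone sketch with made-up sizes and strides:

#include <ATen/ATen.h>

// Allocate an uninitialized 2x3 float tensor laid out column-major
// (stride 1 along dim 0, stride 2 along dim 1), without zero-filling it.
at::Tensor column_major_buffer() {
  return at::empty_strided({2, 3}, {1, 2},
                           at::TensorOptions().dtype(at::kFloat));
}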
-struct AT_API SplitUntil32Bit { - struct AT_API iterator { +struct CAFFE2_API SplitUntil32Bit { + struct CAFFE2_API iterator { iterator() {}; iterator(const TensorIterator& iter); iterator(iterator&&) = default; diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 634e7a443d21f..c470f554c1423 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -133,7 +133,7 @@ Tensor expand_as(const Tensor& self, const Tensor& other) { } Tensor as_strided(const Tensor& self, IntList size, IntList stride, int64_t storage_offset) { - return self.type().tensor().set_(self.storage(), storage_offset, size, stride); + return at::empty({0}, self.options()).set_(self.storage(), storage_offset, size, stride); } Tensor &as_strided_(Tensor& self, IntList size, IntList stride, int64_t storage_offset) { @@ -196,7 +196,7 @@ Tensor repeat(const Tensor& self, IntList repeats) { Tensor xtensor = self.expand(padded_size); - Tensor result = self.type().tensor(target_size); + Tensor result = at::empty(target_size, self.options()); Tensor urtensor = at::alias(result); for (int64_t i = 0; i < xtensor.dim(); ++i) { // can't unfold with step 0, so make sure step is at least 1 diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 89a13e14b8b2e..f6434b2c957c1 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -31,17 +31,17 @@ namespace at { namespace native { Tensor clamp(const Tensor& self, Scalar min, Scalar max) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return clamp_out(result, self, min, max); } Tensor clamp_max(const Tensor& self, Scalar max) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return clamp_max_out(result, self, max); } Tensor clamp_min(const Tensor& self, Scalar min) { - Tensor result = self.type().tensor(); + Tensor result = at::empty({0}, self.options()); return clamp_min_out(result, self, min); } @@ -99,7 +99,7 @@ Tensor& fill_(Tensor& self, const Tensor& value) { Tensor mvlgamma(const Tensor& self, int64_t p) { AT_CHECK(at::isFloatingType(self.type().scalarType()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().toCByte(), + AT_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. + 0.5, 0.5, 0.5, self.options()); @@ -110,7 +110,7 @@ Tensor mvlgamma(const Tensor& self, int64_t p) { Tensor& mvlgamma_(Tensor& self, int64_t p) { AT_CHECK(at::isFloatingType(self.type().scalarType()), "mvlgamma is not implemented for ", self.type()); - AT_CHECK((self > 0.5 * (p - 1.)).all().toCByte(), + AT_CHECK((self > 0.5 * (p - 1.)).all().item(), "Condition for computing multivariate log-gamma not met"); AT_CHECK(p >= 1, "p has to be greater than or equal to 1"); Tensor args = native::arange(-p / 2. 
+ 0.5, 0.5, 0.5, self.options()); @@ -123,7 +123,7 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { #define IMPLEMENT_UNARY_OP_VEC(op) \ Tensor op(const Tensor& self) { \ - Tensor result = self.type().tensor(); \ + Tensor result = at::empty({0}, self.options()); \ return at::op##_out(result, self); \ } \ Tensor& _##op##__cpu(Tensor& self_) { \ @@ -143,7 +143,7 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) { #define IMPLEMENT_UNARY_OP_TH(op) \ Tensor op(const Tensor& self) { \ - Tensor result = self.type().tensor(); \ + Tensor result = at::empty({0}, self.options()); \ return at::op##_out(result, self); \ } \ Tensor& _##op##__cpu(Tensor& self) { \ diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 1bce68730f0d0..505054b8d431c 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -72,11 +72,10 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; + int64_t input_stride0 = 1, input_stride1 = 1; if (input_ndim > 1) { channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); input_stride0 = strides[0]; input_stride1 = strides[1]; } @@ -189,11 +188,10 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te AT_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; + int64_t input_stride0 = 1, input_stride1 = 1; if (input_ndim > 1) { channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); input_stride0 = strides[0]; input_stride1 = strides[1]; } diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 8715a9ef460ee..c8ea2c3c69196 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -346,6 +346,7 @@ class CuFFTConfig { // be fine for now. // TODO: When CUDA 10 comes out, check if the bug is fixed or if we need another // number for CUDA 10. +// Update: bug related to cuFFT plan cache max size has been fixed in CUDA 10. constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; static_assert(CUFFT_MAX_PLAN_NUM >= 0 && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), "CUFFT_MAX_PLAN_NUM not in size_t range"); @@ -389,12 +390,17 @@ class CuFFTParamsLRUCache { // Miss // remove if needed + // bug related to cuFFT plan cache max size has been fixed + // in CUDA 10. Hence, when compiling with CUDA 10, just + // don't do the erase. + #if CUDA_VERSION < 10000 if (_usage_list.size() >= _max_size) { auto last = _usage_list.end(); last--; _cache_map.erase(last->first); _usage_list.pop_back(); } + #endif // construct new plan at list front, then insert into _cache_map _usage_list.emplace_front(std::piecewise_construct, @@ -414,7 +420,8 @@ class CuFFTParamsLRUCache { void resize(int64_t new_size) { _set_max_size(new_size); - + // no-op when compiling with CUDA 10. 
+ #if CUDA_VERSION < 10000 auto cur_size = _usage_list.size(); if (cur_size > _max_size) { auto delete_it = _usage_list.end(); @@ -424,17 +431,26 @@ class CuFFTParamsLRUCache { } _usage_list.erase(delete_it, _usage_list.end()); } + #endif } size_t size() const { return _cache_map.size(); } - size_t max_size() const noexcept { return _max_size; } + size_t max_size() const noexcept { + #if CUDA_VERSION < 10000 + return _max_size; + #else + return size(); + #endif + } private: // Only sets size and does value check. Does not resize the data structures. void _set_max_size(int64_t new_size) { + #if CUDA_VERSION < 10000 AT_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + #endif AT_CHECK(new_size >= 0, "cuFFT plan cache size must be non-negative, but got ", new_size); _max_size = static_cast(new_size); diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 02c143254ced7..f6128389f16f2 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -192,7 +192,7 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor const dim3 grid(grid_x, grid_y); const dim3 block(block_x, block_y); - Tensor buffer = result.type().tensor({n - 1, result.size(0), result.size(1)}); + Tensor buffer = at::empty({n - 1, result.size(0), result.size(1)}, result.options()); AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] { if (p == 1.0) { pdist_backward_kernel_cuda_impl::one><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index fc908714f18f2..50ea3a9bf32b2 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -182,7 +182,7 @@ void bernoulli_scalar_cuda_kernel( namespace at { namespace native { Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { - Tensor ret = lambda.type().tensor(lambda.sizes()); + Tensor ret = at::empty(lambda.sizes(), lambda.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "poisson", [&] { poisson_cuda_kernel(ret, lambda, next_philox_seed(gen, 20)); }); @@ -190,7 +190,7 @@ Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { } Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { - Tensor ret = alpha.type().tensor(alpha.sizes()); + Tensor ret = at::empty(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "gamma", [&] { gamma_cuda_kernel(ret, alpha, next_philox_seed(gen, 10)); }); @@ -198,7 +198,7 @@ Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { } Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "_standard_gamma_grad", [&] { gamma_grad_cuda_kernel(ret, self, output); }); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 2d133a70dc23b..6976565de059a 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -97,7 +97,7 @@ void masked_scale_kernel(at::Tensor& ret, const at::Tensor src, const at::Tensor std::tuple fused_dropout_cuda(const Tensor& self, double p, Generator * gen){ Tensor ret = at::empty_like(self); - Tensor mask = 
self.type().toScalarType(kByte).tensor(self.sizes()); + Tensor mask = at::empty(self.sizes(), self.options().dtype(kByte)); const int64_t nelem = self.numel(); const int64_t block_size = 256; unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size; diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 27b079fe219e2..ddc01923859b1 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -349,7 +349,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, // FIXME: thrust::unique only removes consecutive elements that are equal. // We have race conditions when indices contain duplicates which are not // adjacent - auto unique_indices = indices.type().tensor(indices.numel()); + auto unique_indices = at::empty(indices.numel(), indices.options()); auto unique_data = device_ptr(unique_indices.data()); auto end = thrust::unique_copy(policy, indices_data, indices_data + num_indices, unique_data); auto num_unique_indices = static_cast(end - unique_data); diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index 80c7aaeb74f6a..a5802192eb77d 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -53,12 +53,16 @@ static magma_queue_t createMagmaQueue(const Tensor& tensor) { magma_queue_create_from_cuda( tensor.get_device(), at::cuda::getCurrentCUDAStream(), - THCState_getCurrentBlasHandle(context.getTHCState()), - THCState_getCurrentSparseHandle(context.getTHCState()), + at::cuda::getCurrentCUDABlasHandle(), + at::cuda::getCurrentCUDASparseHandle(), &magma_queue); return magma_queue; } +static void destroyMagmaQueue(magma_queue_t& existing_queue) { + magma_queue_destroy(existing_queue); +} + static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { auto result = static_cast(value); if (static_cast(result) != value) { @@ -117,9 +121,11 @@ AT_ERROR("gesv: MAGMA library not found in " ipiv_array[i] = &ipiv_data[i * n]; } + magma_queue_t gesv_queue = createMagmaQueue(b); magmaGesvBatched( n, nrhs, A_array, n, ipiv_array, b_array, n, - info_array, batch_size, createMagmaQueue(b)); + info_array, batch_size, gesv_queue); + destroyMagmaQueue(gesv_queue); for (int64_t i = 0; i < batch_size; i++) { infos[i] = info_array[i]; diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index b3435bd0f6bfb..6b5a0e59d08ab 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -63,6 +63,13 @@ template void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { ASSERT_HOST_DEVICE_LAMBDA(func_t); + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_nullary_kernel(sub_iter, f); + } + return; + } + char* out_data = (char*)iter.data_ptr(0); using traits = function_traits; @@ -93,6 +100,13 @@ template void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { ASSERT_HOST_DEVICE_LAMBDA(func_t); + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_unary_kernel(sub_iter, f); + } + return; + } + char* out_data = (char*)iter.data_ptr(0); const char* in1_data = (char*)iter.data_ptr(1); diff --git a/aten/src/ATen/native/cuda/RoiPooling.cu b/aten/src/ATen/native/cuda/RoiPooling.cu index 0fd3f1d6efd15..6c0a90d4c2f48 100644 --- a/aten/src/ATen/native/cuda/RoiPooling.cu +++ b/aten/src/ATen/native/cuda/RoiPooling.cu @@ -122,13 
+122,13 @@ std::tuple RoiPooling2d_forward_cuda( auto inputWidth = input.size(3); // Output Tensor is (num_rois, C, pooledHeight, pooledWidth) - auto output = input.type().tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto output = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options()); // TODO: need some mechanism for determining train vs. test // During training, we need to store the argmaxes for the pooling operation, so // the argmaxes Tensor should be the same size as the output Tensor - auto argmaxes = input.type().toScalarType(kInt).tensor({proposals, inputChannels, pooledHeight, pooledWidth}); + auto argmaxes = at::empty({proposals, inputChannels, pooledHeight, pooledWidth}, input.options().dtype(kInt)); AT_CHECK(input.is_contiguous(), "input must be contiguous"); AT_CHECK(rois.is_contiguous(), "rois must be contiguous"); @@ -198,7 +198,7 @@ Tensor RoiPooling2d_backward_cuda( auto inputHeight = input.size(2); auto inputWidth = input.size(3); - auto gradInput = input.type().tensor(input.sizes()); + auto gradInput = at::empty(input.sizes(), input.options()); dim3 block(512); dim3 grid((gradInput.numel() + 512 - 1) / 512); diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 38b1dddb49627..51ab68a4f78f1 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -184,7 +184,7 @@ static inline Tensor _run_cufft( auto& ctx = at::globalContext(); // set output - auto output = input.type().tensor(output_sizes); + auto output = at::empty(output_sizes, input.options()); // set to current stream CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 2ab983c17721a..0ef8ebabf065a 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -258,7 +258,7 @@ Tensor _bincount_cuda_template( AT_ERROR("input and weights should have the same length"); } - auto nbins = self.max().toCLong() + 1L; + auto nbins = self.max().item() + 1L; nbins = std::max(nbins, minlength); // alloc output counter on GPU Tensor output; diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu index 8e0cf4e1b76c6..8f99241ca35a9 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cu +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -32,7 +32,7 @@ Tensor _s_where_cuda( const Tensor& condition, const Tensor& self, const Tensor& other) { - Tensor ret = self.type().tensor(self.sizes()); + Tensor ret = at::empty(self.sizes(), self.options()); AT_DISPATCH_ALL_TYPES_AND_HALF(ret.type(), "where", [&] { where_cuda(ret, condition, self, other); }); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 309b54a299caa..cbddd0ae87a13 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -49,14 +49,14 @@ Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { result.copy_(randperm_out_cuda(result_float, n, generator)); } else { if (n < 30000) { // For small inputs, we offload it to CPU instead. 
- auto result_cpu = result.type().cpu().tensor({n}); + auto result_cpu = at::empty({n}, result.options().device(kCPU)); randperm_out(result_cpu, n, generator); result.copy_(result_cpu); } else { // Generate random values for the keys array AT_DISPATCH_ALL_TYPES( result.type(), "randperm_out_cuda", [&] { - auto keys = result.type().tensor(result.sizes()).random_(generator); + auto keys = at::empty(result.sizes(), result.options()).random_(generator); auto result_data = thrust::device_ptr(result.data()); auto keys_data = thrust::device_ptr(keys.data()); diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index 67d8f39e2de71..5700ca559f0fe 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -329,7 +329,7 @@ std::tuple weight_norm_cuda at::ScalarType::Float : g.type().scalarType(); // Will this create norms on the same device as g, regardless of what the thread's default // current device is? I believe so, because Type::* functions are DeviceGuard()ed. - auto norms = g.type().toScalarType(AccType).tensor(g.sizes(), g.strides()); + auto norms = at::empty_strided(g.sizes(), g.strides(), g.options().dtype(AccType)); const int ndims = v.dim(); diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index 463d4ffea3cf0..a12df78c767e2 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -59,7 +59,7 @@ Tensor cudnn_affine_grid_generator_forward( checkContiguous(c, theta); checkSize(c, theta, {N, 2, 3}); - auto grid_t = theta->type().tensor(); + auto grid_t = at::empty({0}, theta->options()); grid_t.resize_({N, H, W, 2}); auto dataType = getCudnnDataType(*theta); @@ -82,7 +82,7 @@ Tensor cudnn_affine_grid_generator_backward( checkContiguous(c, grad_grid); checkSize(c, grad_grid, {N, H, W, 2}); - auto grad_theta_t = grad_grid->type().tensor(); + auto grad_theta_t = at::empty({0}, grad_grid->options()); grad_theta_t.resize_({N, 2, 3}); auto dataType = getCudnnDataType(grad_theta_t); diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index d54fe256b2915..427f7e00d9d90 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -94,7 +94,7 @@ std::tuple cudnn_batch_norm( #endif } - auto output_t = input->type().tensor(input->sizes()); + auto output_t = at::empty(input->sizes(), input->options()); TensorArg output{ output_t, "output", 0 }; auto handle = getCudnnHandle(); @@ -108,8 +108,8 @@ std::tuple cudnn_batch_norm( if (training) { int64_t num_features = input_t.size(1); - save_mean = weight_t.type().tensor({ num_features }); - save_var = weight_t.type().tensor({ num_features }); + save_mean = at::empty({ num_features }, weight_t.options()); + save_var = at::empty({ num_features }, weight_t.options()); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTraining( handle, mode, &one, &zero, idesc.desc(), input->data_ptr(), @@ -190,9 +190,9 @@ std::tuple cudnn_batch_norm_backward( #endif } - auto grad_input_t = input->type().tensor(input->sizes()); - auto grad_weight_t = weight->type().tensor(weight->sizes()); - auto grad_bias_t = weight->type().tensor(weight->sizes()); + auto grad_input_t = at::empty(input->sizes(), input->options()); + auto grad_weight_t = at::empty(weight->sizes(), weight->options()); + auto grad_bias_t = at::empty(weight->sizes(), weight->options()); auto handle = getCudnnHandle(); 
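The CUDA and cuDNN factory rewrites in the surrounding hunks all lean on TensorOptions being a small value type that can be tweaked before allocation: .dtype(...) for integer argmaxes and byte workspaces, .device(kCPU) for randperm's small-input staging path. A sketch (the helper and the variable named staging are illustrative, not from this patch):

#include <ATen/ATen.h>

void options_tweaks(const at::Tensor& input) {
  // was: input.type().toScalarType(kInt).tensor(input.sizes());
  at::Tensor argmaxes = at::empty(input.sizes(), input.options().dtype(at::kInt));

  // was: result.type().cpu().tensor({n});  (CUDA tensor, staging buffer on CPU)
  at::Tensor staging = at::empty({16}, input.options().device(at::kCPU));

  (void)argmaxes; (void)staging;
}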
auto dataType = getCudnnDataType(*input); diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index afbd7653aefa6..9638740c24a6a 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -836,9 +836,10 @@ Tensor cudnn_convolution_forward( checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto output_t = input->type().tensor( + auto output_t = at::empty( conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation, groups)); + padding, stride, dilation, groups), + input->options()); // Avoid ambiguity of "output" when this is being used as backwards TensorArg output{ output_t, "result", 0 }; @@ -976,7 +977,7 @@ Tensor cudnn_convolution_backward_input( checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto grad_input_t = grad_output->type().tensor(input_size); + auto grad_input_t = at::empty(input_size, grad_output->options()); // Avoid "grad_input" when this is being used as transposed convolution TensorArg grad_input{ grad_input_t, "result", 0 }; @@ -1111,7 +1112,7 @@ Tensor cudnn_convolution_backward_weight( checkAllSameType(c, {grad_output, input}); checkAllSameGPU(c, {grad_output, input}); - auto grad_weight_t = grad_output->type().tensor(weight_size); + auto grad_weight_t = at::empty(weight_size, grad_output->options()); // For uniformity with everything else, although it seems grad_weight // would be unambiguous too. @@ -1179,8 +1180,8 @@ Tensor cudnn_convolution_backward_bias( TensorArg grad_output{ grad_output_t, "grad_output", 1 }; setCuDNNStreamToCurrent(); - auto grad_bias_t = grad_output->type().tensor( - { grad_output->size(output_channels_dim) }); + auto grad_bias_t = at::empty( + { grad_output->size(output_channels_dim) }, grad_output->options()); TensorArg grad_bias{ grad_bias_t, "result", 0 }; diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index e859344bcc369..f9b7781036520 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -75,7 +75,7 @@ Tensor cudnn_grid_sampler_forward( checkGridSize(c, grid, input); checkDim(c, input, 4); - auto output_t = input->type().tensor(); + auto output_t = at::empty({0}, input->options()); output_t.resize_({input->size(0), input->size(1), grid->size(1), grid->size(2)}); TensorDescriptor idesc{ *input }; // input descriptor @@ -114,9 +114,9 @@ std::tuple cudnn_grid_sampler_backward( checkDim(c, input, 4); checkDim(c, grad_output, 4); - auto grad_input_t = input->type().tensor(); + auto grad_input_t = at::empty({0}, input->options()); grad_input_t.resize_(input->sizes()); - auto grad_grid_t = grid->type().tensor(); + auto grad_grid_t = at::empty({0}, grid->options()); grad_grid_t.resize_(grid->sizes()); TensorDescriptor idesc{ *input }; // input descriptor diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 98c0cb7918f02..28fd81f9a9a99 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -75,7 +75,7 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens algo, ctc_loss_desc.desc(), &workspace_size)); - Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? 
+ Tensor workspace = at::empty(workspace_size, log_probs->options().dtype(kByte)); Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 09c9365793ec7..35af9919d46d2 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -464,7 +464,7 @@ namespace { mat_numel * num_linear_layers / 2, 1}; // Generate a new parameter tensor which is a view into the // weight_buf. - Tensor param = weight_buf.type().tensor().set_(weight_buf.storage(), offset, size); + Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); params.emplace_back(std::move(param)); layer_params_count++; } else { @@ -616,7 +616,7 @@ Tensor _cudnn_rnn_flatten_weight( x_desc.set(getCudnnDataType(any_param), x_geom.sizes(), x_geom.strides(), 5); auto num_weights = get_num_weights(handle, rnn_desc, x_desc, rnn.datatype); - auto weight_buf = any_param.type().tensor(num_weights).zero_(); + auto weight_buf = at::zeros(num_weights, any_param.options()); FilterDescriptor w_desc; w_desc.set(weight_buf, 3); @@ -691,13 +691,13 @@ std::tuple _cudnn_rnn( "rnn: cx is not contiguous"); auto x = input.contiguous(); - auto output = input.type().tensor(output_size); - auto hy = hx.type().tensor(hidden_size); + auto output = at::empty(output_size, input.options()); + auto hy = at::empty(hidden_size, hx.options()); Tensor cy; if (cx.defined()) { - cy = cx.type().tensor(hidden_size); + cy = at::empty(hidden_size, cx.options()); } else { - cy = hx.type().tensor(); // NB: Not allowed to return undefined tensors + cy = at::empty({0}, hx.options()); // NB: Not allowed to return undefined tensors } auto y = output; @@ -709,7 +709,7 @@ std::tuple _cudnn_rnn( FilterDescriptor w_desc; if (!weight_buf.defined()) { auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], fn.rnn.datatype); - weight_buf = x.type().tensor(num_weights); + weight_buf = at::empty(num_weights, x.options()); w_desc.set(weight_buf, 3); weight_buf.zero_(); std::vector params; @@ -734,7 +734,7 @@ std::tuple _cudnn_rnn( x_descs_arr.data(), &workspace_size )); - Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); Tensor reserve; // NB: Previously, the test was for fn.requires_grad, but we don't have @@ -748,7 +748,7 @@ std::tuple _cudnn_rnn( x_descs_arr.data(), &reserve_size )); - reserve = input.type().toScalarType(kByte).tensor(reserve_size); + reserve = at::empty(reserve_size, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNForwardTraining( handle, descs.rnn_desc.desc(), @@ -764,7 +764,7 @@ std::tuple _cudnn_rnn( reserve.data_ptr(), reserve.size(0) )); } else { // inference - reserve = input.type().toScalarType(kByte).tensor(); + reserve = at::empty({0}, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNForwardInference( handle, descs.rnn_desc.desc(), @@ -836,12 +836,12 @@ std::tuple _cudnn_rnn_backward_input( auto dy = grad_output.contiguous(); auto y = output; auto w = weight_buf; - auto dx = input.type().tensor(input.sizes()); // TODO: more compact way of saying this + auto dx = at::empty(input.sizes(), input.options()); // TODO: more compact way of saying this auto dhy = grad_hy.contiguous().view(hidden_size); auto dcy = grad_cy.defined() ? 
grad_cy.contiguous().view(hidden_size) : Tensor(); - auto dhx = hx.type().tensor(hidden_size); + auto dhx = at::empty(hidden_size, hx.options()); AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); - auto dcx = cx.defined() ? cx.type().tensor(hidden_size) : Tensor(); + auto dcx = cx.defined() ? at::empty(hidden_size, cx.options()) : Tensor(); AT_CHECK(fn_train, "cudnn RNN backward can only be called in training mode"); @@ -881,7 +881,7 @@ std::tuple _cudnn_rnn_backward_input( &workspace_size )); // TODO: put this in the correct device??? - Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNBackwardData( handle, @@ -965,7 +965,7 @@ std::vector _cudnn_rnn_backward_weight( auto x = input.contiguous(); const auto& y = output; - auto dw = weight_buf.type().tensor(weight_buf.sizes()).zero_(); + auto dw = at::zeros(weight_buf.sizes(), weight_buf.options()); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors); fn.rnn.set_algo(algo); @@ -984,7 +984,7 @@ std::vector _cudnn_rnn_backward_weight( x_descs_arr.data(), &workspace_size )); - Tensor workspace = input.type().toScalarType(kByte).tensor(workspace_size); + Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); AT_CUDNN_CHECK(cudnnRNNBackwardWeights( handle, @@ -1001,7 +1001,7 @@ std::vector _cudnn_rnn_backward_weight( std::vector grad_weight_arr; grad_weight_arr.reserve( weight.numel() ); for (const auto& w : weight_arr) { - grad_weight_arr.emplace_back(w.type().tensor(w.sizes()).zero_()); + grad_weight_arr.emplace_back(at::zeros(w.sizes(), w.options())); } std::vector grad_params_arr; @@ -1125,7 +1125,7 @@ DropoutState& get_dropout_state(const Type& tp, double dropout_p, bool train) { : ten_dropout_state_cache.at(device); if (train && dropout_p > 0 && !state.buffer.defined()) { std::unique_lock lock {state.mutex}; - int64_t seed = at::empty({}, at::kLong).random_().toCLong(); + int64_t seed = at::empty({}, at::kLong).random_().item(); state.buffer = at::_cudnn_init_dropout_state( tp.toScalarType(at::kByte), dropout_p, train, seed); // NB: CUDA binds the event to a device at creation time, so we can initialize it @@ -1155,7 +1155,7 @@ Tensor try_get_weight_buf( // Try to get parameter storage auto & any_param = parameters.at(0); auto param_storage = any_param.storage(); - auto weight_buf = any_param.type().tensor().set_(param_storage); + auto weight_buf = at::empty({0}, any_param.options()).set_(param_storage); if (weight_buf.size(0) < num_params) { return {}; } else if (weight_buf.size(0) > num_params) { diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index c9d25780bd65d..f7d163bee732e 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -89,7 +89,7 @@ std::tuple miopen_batch_norm( mode = miopenBNSpatial; } - auto output_t = input->type().tensor(input->sizes()); + auto output_t = at::empty(input->sizes(), input->options()); TensorArg output{ output_t, "output", 0 }; auto handle = getMiopenHandle(); @@ -103,8 +103,8 @@ std::tuple miopen_batch_norm( if (training) { int64_t num_features = input_t.size(1); - save_mean = weight_t.type().tensor({ num_features }); - save_var = weight_t.type().tensor({ num_features }); + save_mean = at::empty({ num_features }, weight_t.options()); + save_var = at::empty({ num_features }, 
weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardTraining( handle, mode, &one, &zero, idesc.desc(), input->data_ptr(), @@ -177,9 +177,9 @@ std::tuple miopen_batch_norm_backward( mode = miopenBNSpatial; } - auto grad_input_t = input->type().tensor(input->sizes()); - auto grad_weight_t = weight->type().tensor(weight->sizes()); - auto grad_bias_t = weight->type().tensor(weight->sizes()); + auto grad_input_t = at::empty(input->sizes(), input->options()); + auto grad_weight_t = at::empty(weight->sizes(), weight->options()); + auto grad_bias_t = at::empty(weight->sizes(), weight->options()); auto handle = getMiopenHandle(); auto dataType = getMiopenDataType(*input); diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 9aeaad7355861..6515574a299c6 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -616,9 +616,10 @@ Tensor miopen_convolution_forward( checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto output_t = input->type().tensor( + auto output_t = at::empty( conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation, groups)); + padding, stride, dilation, groups), + input->options()); // Avoid ambiguity of "output" when this is being used as backwards TensorArg output{ output_t, "result", 0 }; @@ -734,7 +735,7 @@ Tensor miopen_convolution_backward_input( checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto grad_input_t = grad_output->type().tensor(input_size); + auto grad_input_t = at::empty(input_size, grad_output->options()); // Avoid "grad_input" when this is being used as transposed convolution TensorArg grad_input{ grad_input_t, "result", 0 }; @@ -859,7 +860,7 @@ Tensor miopen_convolution_backward_weight( checkAllSameType(c, {grad_output, input}); checkAllSameGPU(c, {grad_output, input}); - auto grad_weight_t = grad_output->type().tensor(weight_size); + auto grad_weight_t = at::empty(weight_size, grad_output->options()); // For uniformity with everything else, although it seems grad_weight // would be unambiguous too. 
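The hunks above and below all apply the same mechanical migration: tensor allocation moves off the deprecated Type-based factories (input->type().tensor(...), type().toScalarType(kByte).tensor(...)) and onto the free factory functions that take a TensorOptions. A minimal sketch of the pattern, with illustrative names (input, sizes and n are placeholders, not identifiers from this patch):

    // Old style: allocate through the Type object attached to a tensor.
    //   auto out = input.type().tensor(sizes);
    //   auto buf = input.type().toScalarType(at::kByte).tensor(n);
    // New style: free factory function plus TensorOptions, which carries the
    // dtype, device and layout of the source tensor.
    auto out  = at::empty(sizes, input.options());               // uninitialized
    auto zero = at::zeros(sizes, input.options());               // zero-filled
    auto buf  = at::empty(n, input.options().dtype(at::kByte));  // same device, byte dtype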
@@ -917,8 +918,7 @@ Tensor miopen_convolution_backward_bias( TensorArg grad_output{ grad_output_t, "grad_output", 1 }; setMIOpenStreamToCurrent(); - auto grad_bias_t = grad_output->type().tensor( - { grad_output->size(output_channels_dim) }); + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); TensorArg grad_bias{ grad_bias_t, "result", 0 }; diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 2c81d69d3b843..1d92de58bb7ec 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -207,7 +207,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, onumel *= osize; } } - Tensor output = input.type().tensor(output_sizes); + Tensor output = at::empty(output_sizes, input.options()); // precision DFTI_CONFIG_VALUE prec; diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index ddbd6977645e7..adfe15decbc9b 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -70,8 +70,8 @@ at::Tensor mkldnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, IntList padding, IntList stride, IntList dilation, int64_t groups) { - auto output = input.type().tensor(conv_output_size( - input.sizes(), weight.sizes(), padding, stride, dilation, groups)); + auto output = at::empty(conv_output_size( + input.sizes(), weight.sizes(), padding, stride, dilation, groups), input.options()); auto cpu_engine = CpuEngine::Instance().get_engine(); @@ -182,7 +182,7 @@ Tensor mkldnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - auto grad_input = grad_output.type().tensor(input_size); + auto grad_input = at::empty(input_size, grad_output.options()); auto cpu_engine = CpuEngine::Instance().get_engine(); @@ -294,11 +294,11 @@ std::tuple mkldnn_convolution_backward_weights( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - auto grad_weight = grad_output.type().tensor(weight_size); + auto grad_weight = at::empty(weight_size, grad_output.options()); Tensor grad_bias; if (bias_defined) { - grad_bias = grad_output.type().tensor({grad_output.size(1)}); + grad_bias = at::empty({grad_output.size(1)}, grad_output.options()); } auto cpu_engine = CpuEngine::Instance().get_engine(); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f54c9110c21f2..2cc0995dabada 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -647,6 +647,8 @@ - func: empty_like(Tensor self, *, TensorOptions options) -> Tensor +- func: empty_strided(IntList size, IntList stride, *, TensorOptions options={}) -> Tensor + - func: erf(Tensor self) -> Tensor variants: function, method @@ -1887,11 +1889,13 @@ - func: native_tensor(Type self_ty) -> Tensor + variants: [] dispatch: SparseCPU: new_sparse SparseCUDA: new_sparse - func: native_tensor(Type self_ty, IntList size) -> Tensor + variants: [] dispatch: SparseCPU: new_with_size_sparse SparseCUDA: new_with_size_sparse @@ -1932,15 +1936,17 @@ SparseCPU: new_with_tensor_and_size_sparse SparseCUDA: new_with_tensor_and_size_sparse -- func: sparse_coo_tensor(Type dtype, IntList size) -> Tensor - variants: 
[] - - func: sparse_coo_tensor(IndexTensor indices, Tensor values) -> Tensor - variants: [] - func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size) -> Tensor - variants: [] +# FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given +# the default would never make sense. +- func: sparse_coo_tensor(IntList size, *, TensorOptions options) -> Tensor + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, *, TensorOptions options) -> Tensor + +- func: sparse_coo_tensor(IndexTensor indices, Tensor values, IntList size, *, TensorOptions options) -> Tensor - func: _native_sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor variants: [] diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 49efed2a1e066..83aee52cf8102 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -286,8 +286,8 @@ SparseTensor coalesce_sparse_cpu(const SparseTensor& self) { SparseTensor dst = new_sparse(self.type()); _get_sparse_impl(dst)->resize_(sparseDims, denseDims, self.sizes()); // TODO: is there a more idiomatic way to do this? - LongTensor newIndices = indices.type().tensor(indices.sizes()); - Tensor newValues = values.type().tensor(values.sizes()); + LongTensor newIndices = at::empty(indices.sizes(), indices.options()); + Tensor newValues = at::empty(values.sizes(), values.options()); _alias_into_sparse(dst, newIndices, newValues); LongTensor indicesBuffer; @@ -348,7 +348,7 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse int64_t sparseDims = mask._sparseDims(); LongTensor mask_indices = mask._indices(); Tensor mask_values = mask._values(); - Tensor r_values = r._values().type().tensor(mask_values.sizes()); + Tensor r_values = at::empty(mask_values.sizes(), r._values().options()); _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); int64_t r_nnz = mask._nnz(); @@ -392,7 +392,7 @@ SparseTensor& sparse_mask_out_cpu(SparseTensor& r, const Tensor& t, const Sparse } SparseTensor sparse_mask_cpu(const Tensor& t, SparseTensorRef mask) { - SparseTensor r = t.type().toSparse().tensor(); + SparseTensor r = at::empty({0}, t.options().layout(kSparse)); sparse_mask_out_cpu(r, t, mask.tref); return r; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index ec074b5a6c8a8..8a8668fc48b8a 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -135,7 +135,7 @@ SparseTensor& pow_out_sparse_scalar(SparseTensor& r, const SparseTensor& t_, Sca } SparseTensor pow_sparse_scalar(const SparseTensor& t, Scalar value) { - SparseTensor r = t.type().tensor(); + SparseTensor r = at::empty({0}, t.options()); pow_out_sparse_scalar(r, t, value); return r; } @@ -208,7 +208,7 @@ SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const S Tensor t_values = t._values(); LongTensor src_indices = src._indices(); Tensor s_values = src._values(); - LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + LongTensor r_indices = at::empty({sparseDims, max_nnz}, t_indices.options()); Tensor r_values = _new_values_with_size_of(s_values, max_nnz).zero_(); r.resize_as_(src); _get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); @@ -387,7 +387,7 @@ SparseTensor& 
mul_out_sparse_cpu(SparseTensor& r, const Tensor& t_, const Tensor Tensor t_values = t._values(); LongTensor src_indices = src._indices(); Tensor s_values = src._values(); - LongTensor r_indices = t_indices.type().tensor({sparseDims, max_nnz}); + LongTensor r_indices = at::empty({sparseDims, max_nnz}, t_indices.options()); Tensor r_values = _new_values_with_size_of(t_values, max_nnz).zero_(); r.resize_as_(src); _get_sparse_impl(r)->set_indices_and_values_unsafe(r_indices, r_values); @@ -570,7 +570,7 @@ Tensor s_addmm_sparse_dense_cpu( Scalar beta, Scalar alpha ) { - Tensor r = t.type().tensor(); + Tensor r = at::empty({0}, t.options()); s_addmm_out_sparse_dense_cpu(r, t, sparse, dense, beta, alpha); return r; } @@ -646,7 +646,7 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, } int64_t outNnz = i + 1; indices.resize_({1, outNnz}); - Tensor values = dense.type().tensor({outNnz, n}); + Tensor values = at::empty({outNnz, n}, dense.options()); std::vector new_size = _get_sparse_impl(newSparse)->sizes().vec(); new_size[0] = outNnz; @@ -660,7 +660,7 @@ SparseTensor& hspmm_out_sparse_cpu(SparseTensor& r, const SparseTensor& sparse_, } SparseTensor hspmm_sparse_cpu(const SparseTensor& sparse, const Tensor& dense) { - SparseTensor r = sparse.type().tensor(); + SparseTensor r = at::empty({0}, sparse.options()); hspmm_out_sparse_cpu(r, sparse, dense); return r; } @@ -787,7 +787,7 @@ Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, // sparse, dense -> sparse Tensor smm(const Tensor& self, const Tensor& mat2) { - auto result = self.type().tensor(); + auto result = at::empty({0}, self.options()); at::sspaddmm_out(result, result, self, mat2, 0.0, 1.0); return result; } @@ -795,7 +795,7 @@ Tensor smm(const Tensor& self, const Tensor& mat2) { // sparse, sparse, dense, real, real -> sparse Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { - auto result = self.type().tensor(); + auto result = at::empty({0}, self.options()); at::sspaddmm_out(result, self, mat1, mat2, beta, alpha); return result; } diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 3ce0eee53353e..2626eedebaf5e 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -110,7 +110,7 @@ inline LongTensor _newFlattenedIndices(const SparseTensor& self, bool forceClone inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { std::vector size = values.sizes().vec(); size[0] = nnz; - return values.type().tensor(size); + return at::empty(size, values.options()); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 107a30f51c2a9..ab9fb15c62873 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -21,7 +21,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars } LongTensor mask_indices = mask._indices(); Tensor mask_values = mask._values(); - Tensor r_values = r._values().type().tensor(mask_values.sizes()); + Tensor r_values = at::empty(mask_values.sizes(), r._values().options()); _alias_into_sparse(r, mask_indices.clone(), r_values); _get_sparse_impl(r)->set_coalesced(mask.is_coalesced()); _get_sparse_impl(r)->set_nnz_and_narrow(mask._nnz()); @@ -51,7 +51,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars } 
SparseTensor sparse_mask_cuda(const Tensor& t, SparseTensorRef mask) { - SparseTensor r = t.type().toSparse().tensor(); + SparseTensor r = at::empty({0}, t.options().layout(kSparse)); sparse_mask_out_cuda(r, t, mask.tref); return r; } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 15d9afc04307a..7579b90b70e07 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -228,7 +228,7 @@ SparseTensor& hspmm_out_sparse_cuda(SparseTensor& r_, const SparseTensor& sparse } SparseTensor hspmm_sparse_cuda(const SparseTensor& sparse, const Tensor& dense) { - SparseTensor r = sparse.type().tensor(); + SparseTensor r = at::empty({0}, sparse.options()); hspmm_out_sparse_cuda(r, sparse, dense); return r; } diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index d27d0da7240fc..68bb4ecc531ca 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -220,8 +220,6 @@ def signature(option, i=None, value=None): def is_extended_method(option): if 'method' in option['variants']: return False - elif not option['variants']: - return False else: return True diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index c6355127734b1..1ca3e495358cb 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -4,7 +4,6 @@ #include #include -#include #include #include @@ -17,7 +16,7 @@ namespace at { struct Generator; class Scalar; -struct Tensor; +class Tensor; struct Type; } // namespace at @@ -49,23 +48,23 @@ inline Tensor from_blob( } // These functions are defined in native/TensorFactories.cpp. 
-#define TENSOR(T, S, _1) \ - AT_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ - inline Tensor tensor( \ - std::initializer_list values, const TensorOptions& options) { \ - return native::tensor(ArrayRef(values), options); \ - } \ - inline Tensor tensor(T value, const TensorOptions& options) { \ - return native::tensor(ArrayRef(value), options); \ - } \ - inline Tensor tensor(ArrayRef values) { \ - return native::tensor(std::move(values), at::dtype(k##S)); \ - } \ - inline Tensor tensor(std::initializer_list values) { \ - return native::tensor(ArrayRef(values)); \ - } \ - inline Tensor tensor(T value) { \ - return native::tensor(ArrayRef(value)); \ +#define TENSOR(T, S, _1) \ + CAFFE2_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor( \ + std::initializer_list values, const TensorOptions& options) { \ + return native::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return native::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return native::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return native::tensor(ArrayRef(values)); \ + } \ + inline Tensor tensor(T value) { \ + return native::tensor(ArrayRef(value)); \ } AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(TENSOR) #undef TENSOR diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 85e7c84961d6e..1d5ac020f231e 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -15,7 +15,7 @@ namespace at { struct Generator; struct Type; -struct Tensor; +class Tensor; struct TensorOptions; } // namespace at @@ -37,11 +37,12 @@ namespace at { // // Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and // special care must be taken to handle this. -struct AT_API Tensor { +class CAFFE2_API Tensor { +public: Tensor(){}; Tensor(c10::intrusive_ptr tensor_impl) - : tensor_impl_(std::move(tensor_impl)) { - if (tensor_impl_.get() == nullptr) { + : impl_(std::move(tensor_impl)) { + if (impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } } @@ -50,25 +51,25 @@ struct AT_API Tensor { Tensor(Tensor&&) = default; int64_t dim() const { - return tensor_impl_->dim(); + return impl_->dim(); } TensorImpl * unsafeGetTensorImpl() const { - return tensor_impl_.get(); + return impl_.get(); } TensorImpl * unsafeReleaseTensorImpl() { - return tensor_impl_.release(); + return impl_.release(); } const c10::intrusive_ptr& getIntrusivePtr() const { - return tensor_impl_; + return impl_; } bool defined() const { - return tensor_impl_; + return impl_; } void reset() { - tensor_impl_.reset(); + impl_.reset(); } // The following overloads are very intruiging. 
Consider the following @@ -102,11 +103,11 @@ struct AT_API Tensor { // Tensor& operator=(const Tensor&) & = default; // Tensor& operator=(Tensor&&) & = default; Tensor& operator=(const Tensor& x) & { - tensor_impl_ = x.tensor_impl_; + impl_ = x.impl_; return *this; } Tensor& operator=(Tensor&& x) & { - tensor_impl_ = std::move(x.tensor_impl_); + impl_ = std::move(x.impl_); return *this; } @@ -115,37 +116,37 @@ struct AT_API Tensor { Tensor& operator=(Tensor&&) &&; bool is_same(const Tensor& other) const noexcept { - return tensor_impl_ == other.tensor_impl_; + return impl_ == other.impl_; } size_t use_count() const noexcept { - return tensor_impl_.use_count(); + return impl_.use_count(); } size_t weak_use_count() const noexcept { - return tensor_impl_.weak_use_count(); + return impl_.weak_use_count(); } const char * toString() const; IntList sizes() const { - return tensor_impl_->sizes(); + return impl_->sizes(); } IntList strides() const { - return tensor_impl_->strides(); + return impl_->strides(); } int64_t ndimension() const { return dim(); } Type & type() const { - return tensor_impl_->type(); + return impl_->type(); } TensorTypeId type_id() const { - return tensor_impl_->type_id(); + return impl_->type_id(); } ScalarType scalar_type() const { - return dataTypeToScalarType(tensor_impl_->dtype().id()); + return dataTypeToScalarType(impl_->dtype().id()); } const Storage& storage() const { - return tensor_impl_->storage(); + return impl_->storage(); } Tensor toType(const Type & t, bool non_blocking=false) const; Tensor & copy_(const Tensor & src, bool non_blocking=false); @@ -172,20 +173,12 @@ struct AT_API Tensor { template T * data() const; + template + T item() const; + // Purposely not defined here to avoid inlining void print() const; - //toLongData(), toFloatData() etc. - #define TO_TYPE_DATA(T,name,_) \ - T * to##name##Data() const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) - #undef TO_TYPE_DATA - - #define TO_C_TYPE(T,name,_) \ - T toC##name () const; - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) - #undef TO_C_TYPE - // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and // dimension. 
template @@ -230,18 +223,18 @@ struct AT_API Tensor { // ~~~~~ Autograd API ~~~~~ Tensor& set_requires_grad(bool requires_grad) { - tensor_impl_->set_requires_grad(requires_grad); + impl_->set_requires_grad(requires_grad); return *this; } bool requires_grad() const { - return tensor_impl_->requires_grad(); + return impl_->requires_grad(); } Tensor& grad() { - return tensor_impl_->grad(); + return impl_->grad(); } const Tensor& grad() const { - return tensor_impl_->grad(); + return impl_->grad(); } void set_data(Tensor new_data); @@ -267,35 +260,35 @@ struct AT_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr impl_; }; -struct AT_API WeakTensor { - WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} +struct CAFFE2_API WeakTensor { + WeakTensor(const Tensor& t) : weak_impl_(t.impl_) {} // XXX: this can return undefined tensors // Ideally it would be at::optional, but MSVC is too cool for that Tensor lock() const { - return Tensor(weak_tensor_impl_.lock()); + return Tensor(weak_impl_.lock()); } bool is_same(const WeakTensor& other) const noexcept { - return weak_tensor_impl_ == other.weak_tensor_impl_; + return weak_impl_ == other.weak_impl_; } size_t use_count() const noexcept { - return weak_tensor_impl_.use_count(); + return weak_impl_.use_count(); } size_t weak_use_count() const noexcept { - return weak_tensor_impl_.weak_use_count(); + return weak_impl_.weak_use_count(); } TensorImpl* unsafeGetTensorImpl() const { - return weak_tensor_impl_._unsafe_get_target(); + return weak_impl_._unsafe_get_target(); } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_impl_; }; } // namespace at diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 8283bea01f6be..70f56bd37697d 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -81,16 +81,16 @@ inline Device Tensor::device() const { " but found ", \ at::toString(type().scalarType())); \ return static_cast(this->data_ptr()); \ - } \ - inline T* Tensor::to##name##Data() const { \ - return data(); \ } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) #undef DEFINE_CAST -#define DEFINE_TO_C_TYPE(T,name,_) \ -inline T Tensor::toC##name () const { return _local_scalar().to##name (); } +#define DEFINE_TO_C_TYPE(T, name, _) \ + template <> \ + inline T Tensor::item() const { \ + return _local_scalar().to##name(); \ + } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) #undef DEFINE_TO_C_TYPE diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 0e00a5d3499fc..fbbf88823ea24 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -33,7 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; -struct Tensor; +class Tensor; static inline void noop_deleter(void*) {} @@ -47,7 +47,7 @@ enum class TypeID { NumOptions }; -struct AT_API Type { +struct CAFFE2_API Type { explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} @@ -140,7 +140,6 @@ struct AT_API Type { TensorTypeId type_id_; bool is_variable_; bool is_undefined_; - }; } // namespace at diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index e4a75abb48993..73c1f0f1d27cd 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ 
b/aten/src/ATen/templates/TypeDefault.h @@ -6,7 +6,7 @@ namespace at { -struct AT_API TypeDefault : public TypeExtendedInterface { +struct CAFFE2_API TypeDefault : public TypeExtendedInterface { explicit TypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) : TypeExtendedInterface(type_id, is_variable, is_undefined) {} diff --git a/aten/src/ATen/templates/TypeExtendedInterface.h b/aten/src/ATen/templates/TypeExtendedInterface.h index 82cb658c9eeea..03af27f146b66 100644 --- a/aten/src/ATen/templates/TypeExtendedInterface.h +++ b/aten/src/ATen/templates/TypeExtendedInterface.h @@ -3,7 +3,7 @@ namespace at { -struct AT_API TypeExtendedInterface : public Type { +struct CAFFE2_API TypeExtendedInterface : public Type { explicit TypeExtendedInterface(TensorTypeId type_id, bool is_variable, bool is_undefined) : Type(type_id, is_variable, is_undefined) {} ${pure_virtual_extended_type_method_declarations} diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 22be6de7acbc0..ab7e3522bbeda 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -37,10 +37,10 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { empty_t.fill_(3); empty_t.exp_(); - auto a0 = type.tensor(); - auto a1 = type.tensor(); - auto a2 = type.tensor(); - auto a3 = type.tensor(); + auto a0 = at::empty({0}, type.options()); + auto a1 = at::empty({0}, type.options()); + auto a2 = at::empty({0}, type.options()); + auto a3 = at::empty({0}, type.options()); auto a4 = CPU(kDouble).tensor(); std::vector tensors({a0, a1, a2, a3, a4}); diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index 8dffa3d7c02c7..edb3f79fd2d55 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -18,7 +18,7 @@ void trace() { trace += foo_a[i][i]; } - EXPECT_FLOAT_EQ(foo.trace().toCFloat(), trace); + EXPECT_FLOAT_EQ(foo.trace().item(), trace); } // TEST_CASE( "atest", "[]" ) { @@ -27,7 +27,6 @@ TEST(atest, atest) { manual_seed(123, at::kCUDA); auto foo = rand({12,6}); - EXPECT_EQ(foo.data(), foo.toFloatData()); EXPECT_EQ(foo.size(0), 12); EXPECT_EQ(foo.size(1), 6); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index c64fdec0089df..361d24b5a6b76 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -21,7 +21,7 @@ using Catch::Matchers::StartsWith; static void test(Type & type) { CATCH_SECTION( "resize" ) { - auto a = type.tensor(); + auto a = at::empty({0}, type.options()); a.resize_({3,4}); CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); @@ -31,15 +31,15 @@ static void test(Type & type) { CATCH_SECTION( "ones and dot" ) { Tensor b0 = ones({1, 1}, type); - CATCH_REQUIRE(2 == (b0+b0).sum().toCDouble()); + CATCH_REQUIRE(2 == (b0+b0).sum().item()); Tensor b1 = ones({1, 2}, type); - CATCH_REQUIRE(4 == (b1+b1).sum().toCDouble()); + CATCH_REQUIRE(4 == (b1+b1).sum().item()); Tensor b = ones({3, 4}, type); - CATCH_REQUIRE(24 == (b+b).sum().toCDouble()); + CATCH_REQUIRE(24 == (b+b).sum().item()); CATCH_REQUIRE(12 == b.numel()); - CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).item() == 12); } CATCH_SECTION( "rand" ) { @@ -54,7 +54,7 @@ static void test(Type & type) { auto z = b.sort(1); auto z_sorted = std::get<0>(z); - CATCH_REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); + CATCH_REQUIRE(z_sorted[0][0].item() < z_sorted[0][1].item()); } if(type.backend() != Backend::CUDA) 
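The test updates in this file and the ones that follow replace the per-type scalar accessors (toCFloat, toCDouble, toCInt, toCLong, toCHalf) with the templated Tensor::item() introduced in templates/Tensor.h and TensorMethods.h above. A short sketch of the new call pattern (the tensor t is illustrative):

    at::Tensor t = at::ones({});       // zero-dimensional tensor
    float   f = t.item<float>();       // was t.toCFloat()
    double  d = t.item<double>();      // was t.toCDouble()
    int64_t n = t.item<int64_t>();     // was t.toCLong()
    // As the scalar_test change exercises, item() on a tensor with more than
    // one element throws instead of silently picking a value:
    //   at::ones({1, 2}).item<float>();   // error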
@@ -62,7 +62,7 @@ static void test(Type & type) { Tensor b = randperm(15, type); Tensor rv, ri; std::tie(rv, ri) = sort(b, 0); - CATCH_REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); + CATCH_REQUIRE(rv[0].item() <= rv[1].item()); } CATCH_SECTION( "context" ) { @@ -89,7 +89,7 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); } CATCH_SECTION( "loads of adds (with copy)" ) { @@ -102,7 +102,7 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).item() == norm(r).item()); } CATCH_SECTION( "isContiguous" ) { @@ -154,7 +154,7 @@ static void test(Type & type) { CATCH_SECTION( "abs(value)" ) { Tensor r = at::abs(type.scalarTensor(-3)); - CATCH_REQUIRE(r.toCInt() == 3); + CATCH_REQUIRE(r.item() == 3); } //TODO(zach): operator overloads @@ -195,7 +195,7 @@ static void test(Type & type) { auto f = rand({3,4}, type); f[2] = zeros({4}, type); f[1][0] = -1; - CATCH_REQUIRE(f[2][0].toCDouble() == 0); + CATCH_REQUIRE(f[2][0].item() == 0); } CATCH_SECTION( "tensor from TH" ) { @@ -206,14 +206,14 @@ static void test(Type & type) { CATCH_REQUIRE_NOTHROW(tt); } - CATCH_SECTION( "toCFloat" ) { + CATCH_SECTION( "item" ) { Tensor a = zeros({3,4}); Tensor b = ones({3,7}); Tensor c = cat({a,b},1); CATCH_REQUIRE(c.size(1) == 11); Tensor e = rand({}); - CATCH_REQUIRE(*e.data() == e.sum().toCFloat()); + CATCH_REQUIRE(*e.data() == e.sum().item()); } CATCH_SECTION( "to string" ) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 964f6260e7d9f..a89ca81da017f 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -234,7 +234,7 @@ void test(Type &T) { [&]() { int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); int64_t dim1 = rhs.dim() == 0 ? 
1 : rhs.size(0); - require_equal_size_dim(result, result.type().tensor({dim0, dim1})); + require_equal_size_dim(result, at::empty({dim0, dim1}, result.options())); }();); } diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 247830c3cc839..10ffa9afc326f 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -71,7 +71,7 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { auto t = ones({4,4}); auto wha2 = zeros({4,4}).add(t).sum(); - CATCH_REQUIRE( wha2.toCDouble() == 16.0 ); + CATCH_REQUIRE( wha2.item() == 16.0 ); CATCH_REQUIRE( t.sizes()[0] == 4 ); CATCH_REQUIRE( t.sizes()[1] == 4 ); @@ -116,10 +116,10 @@ CATCH_TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { auto x = ones({1,2}, T); - _CATCH_REQUIRE_THROWS(x.toCFloat()); + _CATCH_REQUIRE_THROWS(x.item()); } auto float_one = ones({}, T); - CATCH_REQUIRE(float_one.toCFloat() == 1); - CATCH_REQUIRE(float_one.toCInt() == 1); - CATCH_REQUIRE((float_one.toCHalf() == 1)); + CATCH_REQUIRE(float_one.item() == 1); + CATCH_REQUIRE(float_one.item() == 1); + CATCH_REQUIRE((float_one.item() == 1)); } diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h index 578d689400baf..4cb316adc0174 100644 --- a/aten/src/TH/THAllocator.h +++ b/aten/src/TH/THAllocator.h @@ -32,8 +32,8 @@ TH_API THAllocator* getTHDefaultAllocator(void); // the non-file descriptor constructor enum WithFd { WITH_FD }; -class AT_API THMapAllocator { -public: +class CAFFE2_API THMapAllocator { + public: THMapAllocator(const char *filename, int flags, size_t size); THMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); THMapAllocator(const THMapAllocator&) = delete; @@ -82,12 +82,14 @@ class AT_API THMapAllocator { }; // Base-from-member idiom -struct AT_API THRefcountedMapAllocatorArgCheck { +struct CAFFE2_API THRefcountedMapAllocatorArgCheck { THRefcountedMapAllocatorArgCheck(int flags); }; -class AT_API THRefcountedMapAllocator : private THRefcountedMapAllocatorArgCheck, public THMapAllocator { -public: +class CAFFE2_API THRefcountedMapAllocator + : private THRefcountedMapAllocatorArgCheck, + public THMapAllocator { + public: THRefcountedMapAllocator(const char *filename, int flags, size_t size); THRefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size); diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp index f3c202fd4234b..f430839565471 100644 --- a/aten/src/TH/generic/THTensorFastGetSet.hpp +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -3,47 +3,47 @@ #else static inline scalar_t THTensor_(fastGetLegacy1dNoScalars)(THTensor *self, int64_t x0) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*THTensor_strideLegacyNoScalars(self, 0)]; + return self->unsafe_data()[x0*THTensor_strideLegacyNoScalars(self, 0)]; } static inline scalar_t THTensor_(fastGet1d)(THTensor *self, int64_t x0) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)]; + return self->unsafe_data()[x0*self->stride(0)]; } static inline scalar_t THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)]; } static inline scalar_t THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { - return 
(THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)]; } static inline scalar_t THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)]; } static inline scalar_t THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { - return (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; + return self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)]; } static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)] = value; + self->unsafe_data()[x0*self->stride(0)] = value; } static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)] = value; } static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)] = value; } static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)] = value; } static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, scalar_t value) { - (THStorage_(data)(THTensor_getStoragePtr(self))+self->storage_offset())[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; + self->unsafe_data()[x0*self->stride(0)+x1*self->stride(1)+x2*self->stride(2)+x3*self->stride(3)+(x4)*self->stride(4)] = value; } #endif diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h index 8dadcc034b2c2..323f745a4ac30 100644 --- a/aten/src/THC/THCAllocator.h +++ b/aten/src/THC/THCAllocator.h @@ -7,8 +7,8 @@ THC_API THAllocator* getTHCudaHostAllocator(void); // IPC doesn't support (re)allocation #ifdef __cplusplus -class AT_API THCIpcDeleter { -public: +class CAFFE2_API THCIpcDeleter { + public: THCIpcDeleter(void* data, int device) : data_(data), device_(device) {}; ~THCIpcDeleter(); static at::DataPtr makeDataPtr(void* data, int device); diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 001c8e965f6a6..f481a6292c7f5 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -163,7 +163,7 @@ void 
loadInput( CAFFE_THROW("Not support GPU on mobile."); #endif } else { - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { @@ -200,7 +200,7 @@ void fillInputBlob( int protos_size = tensor_kv.second.protos_size(); caffe2::TensorProto* tensor_proto = tensor_kv.second.mutable_protos(iteration % protos_size); - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); if (tensor_proto->data_type() == caffe2::TensorProto::STRING) { int total_size = tensor_proto->string_data_size(); for (size_t i = 0; i < total_size; i++) { @@ -298,7 +298,7 @@ void writeOutput( #endif } else { writeTextOutput( - workspace->GetBlob(name)->GetMutableTensor(caffe2::CPU), + BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), output_prefix, name); } diff --git a/binaries/speed_benchmark.cc b/binaries/speed_benchmark.cc index 5914e3f58b44b..fd502cf3c078a 100644 --- a/binaries/speed_benchmark.cc +++ b/binaries/speed_benchmark.cc @@ -137,7 +137,7 @@ int main(int argc, char** argv) { if (blob == nullptr) { blob = workspace->CreateBlob(input_names[i]); } - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); tensor->Resize(input_dims); if (input_type_list[i] == "uint8_t") { diff --git a/binaries/tutorial_blob.cc b/binaries/tutorial_blob.cc index f379eac663cbe..ac74ebb5ffb78 100644 --- a/binaries/tutorial_blob.cc +++ b/binaries/tutorial_blob.cc @@ -47,7 +47,7 @@ int main(int argc, char** argv) { LOG(INFO) << "Is the blob type float? " << myblob.IsType(); - + const int& myint_const = myblob.Get(); LOG(INFO) << "The value of the int number stored in the blob is: " @@ -80,7 +80,7 @@ int main(int argc, char** argv) { std::string* pvec = new std::string(); myblob.Reset(pvec); // no need to release pvec, myblob takes ownership. - + LOG(INFO) << "Is the blob now of type string? " << myblob.IsType(); diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt new file mode 100644 index 0000000000000..4b7bab4f42eeb --- /dev/null +++ b/c10/CMakeLists.txt @@ -0,0 +1,43 @@ +# Main build file for the C10 library. +# +# Note that the C10 library should maintain minimal dependencies - especially, +# it should not depend on any library that is implementation specific or +# backend specific. It should in particular NOT be dependent on any generated +# protobuf header files, because protobuf header files will transitively force +# one to link against a specific protobuf version. + +# ---[ Configure macro file. +set(C10_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in cmake_macros.h.in +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/macros/cmake_macros.h.in + ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) + +# Note: if you want to add ANY dependency to the c10 library, make sure you +# check with the core PyTorch developers as the dependendency will be +# transitively passed on to all libraries dependent on PyTorch. +file(GLOB_RECURSE C10_SRCS *.cpp) +file(GLOB_RECURSE C10_HEADERS *.h) +add_library(c10 ${C10_SRCS} ${C10_HEADERS}) +# If building shared library, set dllimport/dllexport proper. +target_compile_options(c10 PRIVATE "-DC10_BUILD_MAIN_LIB") +# Enable hidden visibility if compiler supports it. 
+if (${COMPILER_SUPPORTS_HIDDEN_VISIBILITY}) + target_compile_options(c10 PRIVATE "-fvisibility=hidden") +endif() + +target_include_directories( + c10 PUBLIC + $ + $ + $) + +# ---[ Installation +# Note: for now, we will put all export path into one single Caffe2Targets group +# to deal with the cmake deployment need. Inside the Caffe2Targets set, the +# individual libraries like libc10.so and libcaffe2.so are still self-contained. +install(TARGETS c10 EXPORT Caffe2Targets DESTINATION lib) +install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + DESTINATION include + FILES_MATCHING PATTERN "*.h") +install(FILES ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h + DESTINATION include/c10/macros) diff --git a/c10/c10_dummy.cpp b/c10/c10_dummy.cpp new file mode 100644 index 0000000000000..df4e73171da3f --- /dev/null +++ b/c10/c10_dummy.cpp @@ -0,0 +1,7 @@ +#include "c10/c10_dummy.h" + +namespace c10 { +bool HasC10() { + return true; +} +} // namespace c10 diff --git a/c10/c10_dummy.h b/c10/c10_dummy.h new file mode 100644 index 0000000000000..cf6c6b30c14bb --- /dev/null +++ b/c10/c10_dummy.h @@ -0,0 +1,7 @@ +#pragma once + +#include "c10/macros/Macros.h" + +namespace c10 { +C10_API bool HasC10(); +} diff --git a/c10/macros/Export.h b/c10/macros/Export.h new file mode 100644 index 0000000000000..8e593e0100bbf --- /dev/null +++ b/c10/macros/Export.h @@ -0,0 +1,76 @@ +/* Header file to define the common scaffolding for exported symbols. + * + * Export is by itself a quite tricky situation to deal with, and if you are + * hitting this file, make sure you start with the background here: + * - Linux: https://gcc.gnu.org/wiki/Visibility + * - Windows: + * https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017 + * + * Do NOT include this file directly. Instead, use c10/macros/Macros.h + */ + +#pragma once + +// You do not need to edit this part of file unless you are changing the core +// pytorch export abstractions. +// +// This part defines the C10 core export and import macros. This is controlled +// by whether we are building shared libraries or not, which is determined +// during build time and codified in c10/core/cmake_macros.h. +// When the library is built as a shared lib, EXPORT and IMPORT will contain +// visibility attributes. If it is being built as a static lib, then EXPORT +// and IMPORT basically have no effect. + +// As a rule of thumb, you should almost NEVER mix static and shared builds for +// libraries that depend on c10. AKA, if c10 is built as a static library, we +// recommend everything dependent on c10 to be built statically. If c10 is built +// as a shared library, everything dependent on it should be built as shared. In +// the PyTorch project, all native libraries shall use the macro +// C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static +// libraries. + +#ifdef _WIN32 +#if defined(C10_BUILD_SHARED_LIBS) +#define C10_EXPORT __declspec(dllexport) +#define C10_IMPORT __declspec(dllimport) +#else +#define C10_EXPORT +#define C10_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_EXPORT __attribute__((__visibility__("default"))) +#else // defined(__GNUC__) +#define C10_EXPORT +#endif // defined(__GNUC__) +#define C10_IMPORT C10_EXPORT +#endif // _WIN32 + +// Definition of an adaptive XX_API macro, that depends on whether you are +// building the library itself or not, routes to XX_EXPORT and XX_IMPORT. 
+// Basically, you will need to do this for each shared library that you are +// building, and the instructions are as follows. Assuming that you are building +// a library called libawesome.so, you should: +// (1) for your cmake target (usually done by "add_library(awesome, ...)"), +// define a macro called AWESOME_BUILD_MAIN_DLL using +// target_compile_options. +// (2) define the AWESOME_API macro similar to the one below. +// And in the source file of your awesome library, use AWESOME_API to +// annotate public symbols. + +// Here, for the C10 library, we will define the macro C10_API for both import +// and export. + +// This one is being used by libc10.so +#ifdef C10_BUILD_MAIN_DLL +#define C10_API C10_EXPORT +#else +#define C10_API C10_IMPORT +#endif + +// This one is being used by libcaffe2.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define CAFFE2_API C10_EXPORT +#else +#define CAFFE2_API C10_IMPORT +#endif diff --git a/c10/macros/Legacy.h b/c10/macros/Legacy.h new file mode 100644 index 0000000000000..86752a838acd3 --- /dev/null +++ b/c10/macros/Legacy.h @@ -0,0 +1,7 @@ +/* A centralized location to provide legacy macro support, and a warning about + * when this legacy compatibility symbol is going to be removed in the future. + * + * Do NOT include this file directly. Instead, use c10/macros/Macros.h + */ + +#pragma once diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h new file mode 100644 index 0000000000000..2b438d670f00d --- /dev/null +++ b/c10/macros/Macros.h @@ -0,0 +1,32 @@ +/* Main entry for c10/macros. + * + * In your code, include c10/macros/Macros.h directly, instead of individual + * files in this folder. + */ + +#pragma once + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include "c10/macros/cmake_macros.h" +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#include "c10/macros/Export.h" + +// Disable the copy and assignment operator for a class. Note that this will +// disable the usage of the class in std containers. +#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete + +// Finally, file that provides legacy support for macros +#include "c10/macros/Legacy.h" diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in new file mode 100644 index 0000000000000..73bc803f06355 --- /dev/null +++ b/c10/macros/cmake_macros.h.in @@ -0,0 +1,6 @@ +// Automatically generated header file for the C10 library. +// Do not include this file directly. Instead, include c10/macros/Macros.h. + +#pragma once + +#cmakedefine C10_BUILD_SHARED_LIBS diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index d7490686ab757..3dda37c4c5b1f 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -194,7 +194,6 @@ target_include_directories(caffe2_protos INTERFACE $) target_link_libraries(caffe2_protos PUBLIC protobuf::libprotobuf) # Compile exposed libraries.
-list(APPEND Caffe2_CPU_SRCs $) add_library(caffe2 ${Caffe2_CPU_SRCS}) if (NOT WIN32) target_compile_options(caffe2 PRIVATE "-fvisibility=hidden") @@ -206,6 +205,7 @@ if (${CAFFE2_LINK_LOCAL_PROTOBUF}) else() target_link_libraries(caffe2 PUBLIC protobuf::libprotobuf) endif() +target_link_libraries(caffe2 PUBLIC c10) target_link_libraries(caffe2 PUBLIC ${Caffe2_PUBLIC_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_LIBS}) target_link_libraries(caffe2 PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS}) @@ -333,7 +333,7 @@ if(USE_CUDA) # NB: This must be target_compile_definitions, not target_compile_options, # as the latter is not respected by nvcc if (MSVC) - target_compile_definitions(caffe2_gpu PRIVATE "-DCAFFE2_CUDA_BUILD_MAIN_LIB") + target_compile_definitions(caffe2_gpu PRIVATE "-DCAFFE2_CUDA_BUILD_MAIN_LIB") endif() # Set standard properties on the target diff --git a/caffe2/contrib/gloo/allgather_ops.h b/caffe2/contrib/gloo/allgather_ops.h index 1f55233a095c8..f97a00f8956ee 100644 --- a/caffe2/contrib/gloo/allgather_ops.h +++ b/caffe2/contrib/gloo/allgather_ops.h @@ -114,7 +114,7 @@ class AllgatherOp final : public Operator { params.size = Input(1).size(); params.meta = Input(1).meta(); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); + params.inputs[i] = Input(i + 1).raw_data(); } params.outputs.resize(OutputSize()); params.outputs[0] = Output(0)->raw_mutable_data(params.meta); diff --git a/caffe2/contrib/gloo/allreduce_ops.h b/caffe2/contrib/gloo/allreduce_ops.h index 85d10c313085f..f3b1bd3560b3d 100644 --- a/caffe2/contrib/gloo/allreduce_ops.h +++ b/caffe2/contrib/gloo/allreduce_ops.h @@ -117,8 +117,8 @@ class AllreduceOp final : public Operator { params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); - params.outputs[i] = Output(i)->template raw_mutable_data(); + params.inputs[i] = Input(i + 1).raw_data(); + params.outputs[i] = Output(i)->raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); diff --git a/caffe2/contrib/gloo/broadcast_ops.h b/caffe2/contrib/gloo/broadcast_ops.h index e525b8e158f4c..171dbbd8c97a1 100644 --- a/caffe2/contrib/gloo/broadcast_ops.h +++ b/caffe2/contrib/gloo/broadcast_ops.h @@ -95,8 +95,8 @@ class BroadcastOp final : public Operator { params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); - params.outputs[i] = Output(i)->template raw_mutable_data(); + params.inputs[i] = Input(i + 1).raw_data(); + params.outputs[i] = Output(i)->raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); diff --git a/caffe2/contrib/gloo/common.cc b/caffe2/contrib/gloo/common.cc index 21ce0343d8181..d4929938f1917 100644 --- a/caffe2/contrib/gloo/common.cc +++ b/caffe2/contrib/gloo/common.cc @@ -12,7 +12,7 @@ namespace caffe2 { namespace gloo { void signalFailure(Blob* status_blob, std::exception& /* unused */) { - auto* res = status_blob->GetMutableTensor(CPU); + auto* res = BlobGetMutableTensor(status_blob, CPU); res->Resize(1); res->template mutable_data()[0] = 1; } diff --git a/caffe2/contrib/gloo/reduce_scatter_ops.h b/caffe2/contrib/gloo/reduce_scatter_ops.h index 069c523869493..559b35618a108 100644 --- a/caffe2/contrib/gloo/reduce_scatter_ops.h +++ 
b/caffe2/contrib/gloo/reduce_scatter_ops.h @@ -108,15 +108,15 @@ class ReduceScatterOp final : public Operator { params.inputs.resize(InputSize() - 2); params.outputs.resize(OutputSize() - 1); for (auto i = 0; i < params.inputs.size(); i++) { - params.inputs[i] = Input(i + 1).template raw_data(); - params.outputs[i] = Output(i)->template raw_mutable_data(); + params.inputs[i] = Input(i + 1).raw_data(); + params.outputs[i] = Output(i)->raw_mutable_data(); } params.size = Output(0)->size(); params.meta = Output(0)->meta(); // Verify recvCountsSize == comm_size CAFFE_ENFORCE_EQ(Input(InputSize() - 1).size(), params.context->size); - int* recvCounts = (int*)Input(InputSize() - 1).template raw_data(); + int* recvCounts = (int*)Input(InputSize() - 1).raw_data(); recvCounts_.assign(recvCounts, recvCounts + Input(InputSize() - 1).size()); } diff --git a/caffe2/contrib/nccl/cuda_nccl_gpu.cc b/caffe2/contrib/nccl/cuda_nccl_gpu.cc index b544445a26873..490a69b91abf5 100644 --- a/caffe2/contrib/nccl/cuda_nccl_gpu.cc +++ b/caffe2/contrib/nccl/cuda_nccl_gpu.cc @@ -72,7 +72,7 @@ class NCCLContext { cudaEvent_t master_event_; std::vector events_; - AT_DISABLE_COPY_AND_ASSIGN(NCCLContext); + C10_DISABLE_COPY_AND_ASSIGN(NCCLContext); }; // We share the contexts across multiple operators, hence the diff --git a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc index 972d9231dcf9c..9eee8973142ed 100644 --- a/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc +++ b/caffe2/contrib/nervana/nervana_fc_op_gpu_test.cc @@ -22,7 +22,7 @@ static void AddConstInput(const std::vector& shape, const float value, option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 3612d8b46f1f8..2dd17e0016990 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->IsTensorType(CPU)) { + if (BlobIsTensorType(*blob, CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->IsTensorType(CUDA)) { + } else if (BlobIsTensorType(*blob, CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index 870fc88322b15..e09a54cbd2df5 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -6,246 +6,37 @@ #include #include #include - -#include "caffe2/core/blob_serializer_base.h" #include "caffe2/core/common.h" + +#include +#include #include "caffe2/core/logging.h" #include "caffe2/core/tensor.h" -#include "caffe2/core/typeid.h" -#include "caffe2/proto/caffe2_pb.h" namespace caffe2 { -/** - * @brief Blob is a general container that hosts a typed pointer. - * - * A Blob hosts a pointer as well as its type, and takes charge of deleting it - * properly when the blob is deallocated or re-allocated with a new type. A blob - * could contain anything, although the most common case is to contain a Tensor. - */ -class CAFFE2_API Blob final { - public: - using DestroyCall = void(void*); - - /** - * Initializes an empty Blob. 
- */ - Blob() noexcept : meta_(), pointer_(nullptr), destroy_(nullptr) {} - ~Blob() { Reset(); } - - Blob(Blob&& other) noexcept : Blob() { - swap(other); - } - - Blob& operator=(Blob&& other) noexcept { - Blob(std::move(other)).swap(*this); - return *this; - } - - /** - * Checks if the content stored in the blob is of type T. - */ - template - bool IsType() const noexcept { - return meta_.Match(); - } - - bool IsTensorType(DeviceType device_type) const { - bool is_match = meta_.Match(); - auto* tensor = static_cast(pointer_); - if (is_match && tensor && tensor->GetDeviceType() == device_type) { - return true; - } +inline bool BlobIsTensorType(const Blob& blob, DeviceType device_type) { + bool is_match = blob.meta().Match(); + if (!is_match) { return false; } + const Tensor* tensor = &blob.Get(); + return tensor && tensor->GetDeviceType() == device_type; +} - /** - * Returns the meta info of the blob. - */ - inline const TypeMeta& meta() const noexcept { return meta_; } - - /** - * Returns a printable typename of the blob. - */ - inline const char* TypeName() const noexcept { return meta_.name(); } - - /** - * @brief Gets the const reference of the stored object. The code checks if - * the stored object is of the desired type. - */ - // TODO(jerryzh): add a Get(DeviceType) function? - template - const T& Get() const { - CAFFE_ENFORCE( - IsType(), - "wrong type for the Blob instance. Blob contains ", - meta_.name(), - " while caller expects ", - TypeMeta::TypeName()); - // TODO: after we add Get(DeviceType) - // and changed all the callsites, we can add - // a static assert here to enforce T != Tensor - return *static_cast(pointer_); - } - - const void* GetRaw() const noexcept { - return pointer_; - } - void* GetRaw() noexcept { - return pointer_; - } - - /** - * @brief Gets a mutable pointer to the stored object. - * - * If the current object is not of the right type, a new object is created - * and the old object is freed. Note that type T should have a default - * constructor. Otherwise, create the object yourself first, and use - * Reset(). - */ - template - T* GetMutable() { - static_assert( - std::is_default_constructible::value, - "GetMutable can't be called with non-default-constructible types. " - "Try using specialized methods"); - static_assert( - !std::is_same::value, - "Use GetMutableTensor(DeviceType) instead"); - if (IsType()) { - return static_cast(pointer_); - } else { - VLOG(1) << "Create new mutable object " << TypeMeta::TypeName(); - return Reset(new T()); - } - } - - template - T* GetMutableOrNull() { - if (IsType()) { - return static_cast(pointer_); - } else { - return nullptr; - } - } - - inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsTensorType(device_type)) { - return static_cast(pointer_); - } else { - VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() - << " DeviceType:" << device_type; - return Reset(new Tensor(device_type)); - } - } - - /** - * Sets the underlying object to the allocated one. The Blob then takes over - * the ownership of the passed in pointer. If there is already an object in - * the Blob, the old object is freed. - * - * This is used when the underlying class T does not have a default ctor, or - * complex initializations needs to be done outside the blob. 
- */ - template - T* Reset(T* allocated) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = TypeMeta::Make(); - pointer_ = static_cast(allocated); - destroy_ = &Destroy; - return allocated; - } - - inline void* - Reset(void* allocated, const TypeMeta& meta, DestroyCall* destroy) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = meta; - pointer_ = static_cast(allocated); - destroy_ = destroy; - return allocated; - } - - /** - * Releases the ownership, if any, this Blob has on the underlying pointer. - * The user is then responsible for freeing the data if needed - */ - inline DestroyCall* Release() { - DestroyCall* d = destroy_; - destroy_ = nullptr; - return d; - } - - /** - * Sets the underlying object to the allocated one, but does not take over - * the ownership of the passed in pointer. If there is already an object in - * the Blob, the old object is freed. - * - * Unlike Reset, this does not take over the ownership of the pointer and the - * caller is responsible for making sure that the lifetime of the allocated - * blob outlasts the lifetime of any access to this blob, until another Reset - * call is made or the blob is destructed. - */ - template - typename std::remove_const::type* ShareExternal( - typename std::remove_const::type* allocated) { - return static_cast(ShareExternal( - static_cast(allocated), - TypeMeta::Make::type>())); - } - - void* ShareExternal(void* allocated, const TypeMeta& meta) { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = meta; - pointer_ = static_cast(allocated); - destroy_ = nullptr; - return allocated; - } - - /** - * Resets the Blob to an empty one. - */ - inline void Reset() { - if (pointer_ && destroy_) { - destroy_(pointer_); +inline Tensor* BlobGetMutableTensor(Blob* blob, DeviceType device_type) { + if (blob->IsType()) { + Tensor* tensor = blob->GetMutable(); + if (tensor->GetDeviceType() == device_type) { + return tensor; } - pointer_ = nullptr; - meta_ = TypeMeta(); - destroy_ = nullptr; } - /** - * @brief Swaps the underlying storage of two blobs. - */ - void swap(Blob& rhs) { - using std::swap; - swap(meta_, rhs.meta_); - swap(pointer_, rhs.pointer_); - swap(destroy_, rhs.destroy_); - } - - private: - /** - * @brief A destroy call that is used to properly deconstruct objects. - */ - template - static void Destroy(void* pointer) { - delete static_cast(pointer); - } - TypeMeta meta_; - void* pointer_ = nullptr; - DestroyCall* destroy_ = nullptr; - - AT_DISABLE_COPY_AND_ASSIGN(Blob); -}; - -inline void swap(Blob& lhs, Blob& rhs) { - lhs.swap(rhs); + // if we're here, then either Blob didn't hold a Tensor + // or that Tensor had the wrong DeviceType. 
+ VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() + << " DeviceType:" << device_type; + return blob->Reset(new Tensor(device_type)); } } // namespace caffe2 diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index e8fdf47f69ddb..55eafdede7269 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -132,7 +132,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { for (int i = 0; i < 6; ++i) { \ cpu_tensor.mutable_data()[i] = static_cast(i); \ } \ - blob.GetMutableTensor(CUDA)->CopyFrom(cpu_tensor); \ + BlobGetMutableTensor(&blob, CUDA)->CopyFrom(cpu_tensor); \ string serialized = SerializeBlob(blob, "test"); \ BlobProto proto; \ CAFFE_ENFORCE(proto.ParseFromString(serialized)); \ @@ -149,7 +149,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -199,7 +199,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. blob.Reset(); EXPECT_NO_THROW(DeserializeBlob(serialized, &blob)); - EXPECT_TRUE(blob.IsTensorType(CUDA)); + EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -207,7 +207,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(DeserializeBlob(proto.SerializeAsString(), &blob)); - EXPECT_TRUE(blob.IsTensorType(CUDA)); + EXPECT_TRUE(BlobIsTensorType(blob, CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index 7ff5a2b25eacc..d4ef19db69ce4 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -363,7 +363,8 @@ void TensorDeserializer::Deserialize(const BlobProto& blob_proto, Blob* blob) { auto tensor_proto = blob_proto.tensor(); Deserialize( tensor_proto, - blob->GetMutableTensor( + BlobGetMutableTensor( + blob, static_cast(tensor_proto.device_detail().device_type()))); } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 24b2a2d0593d3..bb2f4ba6a9181 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsTensorType(CPU)); + EXPECT_FALSE(BlobIsTensorType(blob, CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsTensorType(CPU)); + EXPECT_FALSE(BlobIsTensorType(blob, CPU)); - Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsTensorType(CPU)); + Tensor* tensor_unused CAFFE2_UNUSED = BlobGetMutableTensor(&blob, CPU); + EXPECT_TRUE(BlobIsTensorType(blob, CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -600,7 +600,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { #define TEST_SERIALIZATION_WITH_TYPE(TypeParam, field_name) \ TEST(TensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - Tensor* tensor = blob.GetMutableTensor(CPU); \ + Tensor* tensor 
= BlobGetMutableTensor(&blob, CPU); \ tensor->Resize(2, 3); \ for (int i = 0; i < 6; ++i) { \ tensor->mutable_data()[i] = static_cast(i); \ @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -634,7 +634,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { \ TEST(EmptyTensorTest, TensorSerialization_##TypeParam) { \ Blob blob; \ - TensorCPU* tensor = blob.GetMutableTensor(CPU); \ + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); \ tensor->Resize(0, 3); \ tensor->mutable_data(); \ string serialized = SerializeBlob(blob, "test"); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); \ - EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -669,7 +669,7 @@ TEST_SERIALIZATION_WITH_TYPE(int64_t, int64_data) TEST(TensorTest, TensorSerialization_CustomType) { Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(2, 3); for (int i = 0; i < 6; ++i) { tensor->mutable_data()[i].val = i; @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(new_blob.IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); EXPECT_EQ(new_tensor.dim(0), 2); @@ -696,7 +696,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { TEST(TensorTest, Half) { const int64_t kSize = 3000000; Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(kSize); for (int i = 0; i < tensor->size(); ++i) { tensor->mutable_data()[i].x = i % 10000; @@ -724,7 +724,7 @@ TEST(TensorTest, Half) { } Blob new_blob; EXPECT_NO_THROW(DeserializeBlob(serialized, &new_blob)); - EXPECT_TRUE(new_blob.IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(new_blob, CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -860,7 +860,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { { VLOG(1) << "Test begin"; Blob blob; - Tensor* tensor = blob.GetMutableTensor(CPU); + Tensor* tensor = BlobGetMutableTensor(&blob, CPU); VLOG(1) << "Allocating blob"; tensor->Resize(d1, d2); auto mutableData = tensor->mutable_data(); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(*new_blob, CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); @@ -1030,7 +1030,7 @@ TEST(CustomChunkSize, BigTensorSerialization) { int64_t size = d1 * d2; Blob blob; - TensorCPU* tensor = blob.GetMutableTensor(CPU); + TensorCPU* tensor = BlobGetMutableTensor(&blob, CPU); tensor->Resize(d1, d2); tensor->mutable_data(); std::mutex mutex; 
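The blob.h, blob_test.cc, and blob_gpu_test.cc hunks above all apply the same call-site rewrite: the former Blob member functions IsTensorType(DeviceType) and GetMutableTensor(DeviceType) become the free functions BlobIsTensorType(const Blob&, DeviceType) and BlobGetMutableTensor(Blob*, DeviceType) declared in caffe2/core/blob.h. A minimal sketch of the migrated pattern, assuming only the Caffe2 headers touched in this patch (the function name, blob contents, and shape below are illustrative, not taken from any test):

#include "caffe2/core/blob.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {
void FillExampleBlob(Blob* blob) {
  // Old form: blob->GetMutableTensor(CPU); new form: free function taking Blob*.
  Tensor* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(2, 3);
  for (int i = 0; i < tensor->size(); ++i) {
    tensor->mutable_data<float>()[i] = static_cast<float>(i);
  }
  // Old form: blob->IsTensorType(CPU); new form: free function taking const Blob&.
  bool is_cpu_tensor = BlobIsTensorType(*blob, CPU);
  (void)is_cpu_tensor;
}
} // namespace caffe2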
diff --git a/caffe2/core/common.h b/caffe2/core/common.h index 048d634df80df..93bbf341b5061 100644 --- a/caffe2/core/common.h +++ b/caffe2/core/common.h @@ -26,7 +26,7 @@ // is automatically generated by the cmake script during build. #include "caffe2/core/macros.h" -#include "ATen/core/Macros.h" +#include "c10/macros/Macros.h" namespace caffe2 { @@ -94,48 +94,6 @@ using std::vector; #define CAFFE2_NORETURN __attribute__((noreturn)) #endif -// Defines CAFFE2_EXPORT and CAFFE2_IMPORT. On Windows, this corresponds to -// different declarations (dllexport and dllimport). On Linux/Mac, it just -// resolves to the same "default visibility" setting. -#if defined(_MSC_VER) -#if defined(CAFFE2_BUILD_SHARED_LIBS) -#define CAFFE2_EXPORT __declspec(dllexport) -#define CAFFE2_IMPORT __declspec(dllimport) -#else -#define CAFFE2_EXPORT -#define CAFFE2_IMPORT -#endif -#else -#if defined(__GNUC__) -#define CAFFE2_EXPORT __attribute__((__visibility__("default"))) -#else -#define CAFFE2_EXPORT -#endif -#define CAFFE2_IMPORT CAFFE2_EXPORT -#endif - -// CAFFE2_API is a macro that, depends on whether you are building the -// main caffe2 library or not, resolves to either CAFFE2_EXPORT or -// CAFFE2_IMPORT. -// -// This is used in e.g. Caffe2's protobuf files: when building the main library, -// it is defined as CAFFE2_EXPORT to fix a Windows global-variable-in-dll -// issue, and for anyone dependent on Caffe2 it will be defined as -// CAFFE2_IMPORT. - -#ifdef CAFFE2_BUILD_MAIN_LIB -#define CAFFE2_API CAFFE2_EXPORT -#else -#define CAFFE2_API CAFFE2_IMPORT -#endif - -#ifdef CAFFE2_BUILD_OBSERVER_LIB -#define CAFFE2_OBSERVER_API CAFFE2_EXPORT -#else -#define CAFFE2_OBSERVER_API CAFFE2_IMPORT -#endif - - #if defined(_MSC_VER) #define NOMINMAX #endif diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index 5332026eedb0c..c0961c4c6411a 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -258,7 +258,7 @@ class cudnnTensorDescWrapper { cudnnTensorFormat_t format_; cudnnDataType_t type_; vector dims_; - AT_DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper); + C10_DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper); }; class cudnnFilterDescWrapper { @@ -312,7 +312,7 @@ class cudnnFilterDescWrapper { StorageOrder order_; cudnnDataType_t type_; vector dims_; - AT_DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper); + C10_DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper); }; diff --git a/caffe2/core/cudnn_wrappers.h b/caffe2/core/cudnn_wrappers.h index b518914e50402..1bd39fa62a399 100644 --- a/caffe2/core/cudnn_wrappers.h +++ b/caffe2/core/cudnn_wrappers.h @@ -89,7 +89,7 @@ class CuDNNState { cudaStream_t stream_{nullptr}; CuDNNWorkspace workspace_; size_t gpu_id_{0}; - AT_DISABLE_COPY_AND_ASSIGN(CuDNNState); + C10_DISABLE_COPY_AND_ASSIGN(CuDNNState); }; /** @@ -153,7 +153,7 @@ class CuDNNWrapper { CAFFE2_COMPILE_TIME_MAX_GPUS>; static PerGPUCuDNNStates& cudnn_states(); - AT_DISABLE_COPY_AND_ASSIGN(CuDNNWrapper); + C10_DISABLE_COPY_AND_ASSIGN(CuDNNWrapper); }; }; // namespace caffe2 diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index 386787b51c353..720c2dcaa46de 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -119,7 +119,7 @@ class MiniDBTransaction : public Transaction { FILE* file_; std::lock_guard lock_; - AT_DISABLE_COPY_AND_ASSIGN(MiniDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(MiniDBTransaction); }; class MiniDB : public DB { diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 06b74d11bd585..39f8b6f3f02b0 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -52,7 
+52,7 @@ class CAFFE2_API Cursor { */ virtual bool Valid() = 0; - AT_DISABLE_COPY_AND_ASSIGN(Cursor); + C10_DISABLE_COPY_AND_ASSIGN(Cursor); }; /** @@ -71,7 +71,7 @@ class CAFFE2_API Transaction { */ virtual void Commit() = 0; - AT_DISABLE_COPY_AND_ASSIGN(Transaction); + C10_DISABLE_COPY_AND_ASSIGN(Transaction); }; /** @@ -99,7 +99,7 @@ class CAFFE2_API DB { protected: Mode mode_; - AT_DISABLE_COPY_AND_ASSIGN(DB); + C10_DISABLE_COPY_AND_ASSIGN(DB); }; // Database classes are registered by their names so we can do optional @@ -285,7 +285,7 @@ class CAFFE2_API DBReader { uint32_t num_shards_; uint32_t shard_id_; - AT_DISABLE_COPY_AND_ASSIGN(DBReader); + C10_DISABLE_COPY_AND_ASSIGN(DBReader); }; class CAFFE2_API DBReaderSerializer : public BlobSerializerBase { diff --git a/caffe2/core/dispatch/KernelRegistration.h b/caffe2/core/dispatch/KernelRegistration.h index 9ebc20b7ab0a6..619cef616222b 100644 --- a/caffe2/core/dispatch/KernelRegistration.h +++ b/caffe2/core/dispatch/KernelRegistration.h @@ -57,7 +57,7 @@ class KernelRegistrar final { const typename Schema::dispatch::dispatch_key_type dispatch_key_; bool owns_registration_; - AT_DISABLE_COPY_AND_ASSIGN(KernelRegistrar); + C10_DISABLE_COPY_AND_ASSIGN(KernelRegistrar); }; /** diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index e7c19efde21b3..a84d298466dc0 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -9,7 +9,7 @@ namespace caffe2 { #ifdef CAFFE2_USE_GFLAGS -CAFFE2_EXPORT void SetUsageMessage(const string& str) { +C10_EXPORT void SetUsageMessage(const string& str) { if (UsageMessage() != nullptr) { // Usage message has already been set, so we will simply return. return; @@ -17,16 +17,16 @@ CAFFE2_EXPORT void SetUsageMessage(const string& str) { gflags::SetUsageMessage(str); } -CAFFE2_EXPORT const char* UsageMessage() { +C10_EXPORT const char* UsageMessage() { return gflags::ProgramUsage(); } -CAFFE2_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { +C10_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { if (*pargc == 0) return true; return gflags::ParseCommandLineFlags(pargc, pargv, true); } -CAFFE2_EXPORT bool CommandLineFlagsHasBeenParsed() { +C10_EXPORT bool CommandLineFlagsHasBeenParsed() { // There is no way we query gflags right now, so we will simply return true. 
return true; } @@ -48,11 +48,14 @@ std::stringstream& GlobalInitStream() { static string gUsageMessage = "(Usage message not set.)"; } +C10_EXPORT void SetUsageMessage(const string& str) { + gUsageMessage = str; +} +C10_EXPORT const char* UsageMessage() { + return gUsageMessage.c_str(); +} -CAFFE2_EXPORT void SetUsageMessage(const string& str) { gUsageMessage = str; } -CAFFE2_EXPORT const char* UsageMessage() { return gUsageMessage.c_str(); } - -CAFFE2_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { +C10_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { if (*pargc == 0) return true; char** argv = *pargv; bool success = true; @@ -136,18 +139,22 @@ CAFFE2_EXPORT bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { return success; } -CAFFE2_EXPORT bool CommandLineFlagsHasBeenParsed() { +C10_EXPORT bool CommandLineFlagsHasBeenParsed() { return gCommandLineFlagsParsed; } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, string* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + string* value) { *value = content; return true; } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + int* value) { try { *value = std::atoi(content.c_str()); return true; @@ -159,7 +166,9 @@ CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int* valu } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int64_t* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + int64_t* value) { try { static_assert(sizeof(long long) == sizeof(int64_t), ""); #ifdef __ANDROID__ @@ -177,7 +186,9 @@ CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, int64 } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, double* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + double* value) { try { *value = std::atof(content.c_str()); return true; @@ -190,7 +201,9 @@ CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, double } template <> -CAFFE2_EXPORT bool Caffe2FlagParser::Parse(const string& content, bool* value) { +C10_EXPORT bool Caffe2FlagParser::Parse( + const string& content, + bool* value) { if (content == "false" || content == "False" || content == "FALSE" || content == "0") { *value = false; diff --git a/caffe2/core/flags.h b/caffe2/core/flags.h index 2226b66af56fd..4e39c7bdebf13 100644 --- a/caffe2/core/flags.h +++ b/caffe2/core/flags.h @@ -79,14 +79,14 @@ namespace gflags = google; // (3) Gflags has a design issue that does not properly expose the global flags, // if one builds the library with -fvisibility=hidden. The current gflags (as of // Aug 2018) only deals with the Windows case using dllexport, and not the Linux -// counterparts. As a result, we will explciitly use CAFFE2_EXPORT to export the +// counterparts. As a result, we will explciitly use C10_EXPORT to export the // flags defined in Caffe2. This is done via a global reference, so the flag // itself is not duplicated - under the hood it is the same global gflags flag. 
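// For orientation, the wrapper reformatted just below only swaps the export
// macro; the expansion is otherwise unchanged. A hypothetical flag such as
//   CAFFE2_DEFINE_int(caffe2_example_flag, 0, "example help")
// would, in a gflags build, expand to roughly:
//   DEFINE_int32(caffe2_example_flag, 0, "example help");
//   namespace caffe2 {
//   C10_EXPORT gflags::int32& FLAGS_caffe2_example_flag = ::FLAGS_caffe2_example_flag;
//   }
// so caffe2::FLAGS_* is just an exported reference to the underlying gflags flag.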
-#define CAFFE2_GFLAGS_DEF_WRAPPER( \ - type, real_type, name, default_value, help_str) \ - DEFINE_##type(name, default_value, help_str); \ - namespace caffe2 { \ - CAFFE2_EXPORT real_type& FLAGS_##name = ::FLAGS_##name; \ +#define CAFFE2_GFLAGS_DEF_WRAPPER( \ + type, real_type, name, default_value, help_str) \ + DEFINE_##type(name, default_value, help_str); \ + namespace caffe2 { \ + C10_EXPORT real_type& FLAGS_##name = ::FLAGS_##name; \ } #define CAFFE2_DEFINE_int(name, default_value, help_str) \ @@ -102,11 +102,11 @@ namespace gflags = google; string, ::fLS::clstring, name, default_value, help_str) // DECLARE_typed_var should be used in header files and in the global namespace. -#define CAFFE2_GFLAGS_DECLARE_WRAPPER(type, real_type, name) \ - DECLARE_##type(name); \ - namespace caffe2 { \ - CAFFE2_IMPORT extern real_type& FLAGS_##name; \ - } // namespace caffe2 +#define CAFFE2_GFLAGS_DECLARE_WRAPPER(type, real_type, name) \ + DECLARE_##type(name); \ + namespace caffe2 { \ + C10_IMPORT extern real_type& FLAGS_##name; \ + } // namespace caffe2 #define CAFFE2_DECLARE_int(name) \ CAFFE2_GFLAGS_DECLARE_WRAPPER(int32, gflags::int32, name) @@ -150,22 +150,22 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); // write the CAFFE2_DEFINE_* and CAFFE2_DECLARE_* macros outside any namespace // as well. -#define CAFFE2_DEFINE_typed_var(type, name, default_value, help_str) \ - namespace caffe2 { \ - CAFFE2_EXPORT type FLAGS_##name = default_value; \ - namespace { \ - class Caffe2FlagParser_##name : public Caffe2FlagParser { \ - public: \ - explicit Caffe2FlagParser_##name(const string& content) { \ - success_ = Caffe2FlagParser::Parse(content, &FLAGS_##name); \ - } \ - }; \ - } \ - RegistererCaffe2FlagsRegistry g_Caffe2FlagsRegistry_##name( \ - #name, \ - Caffe2FlagsRegistry(), \ - RegistererCaffe2FlagsRegistry::DefaultCreator, \ - "(" #type ", default " #default_value ") " help_str); \ +#define CAFFE2_DEFINE_typed_var(type, name, default_value, help_str) \ + namespace caffe2 { \ + C10_EXPORT type FLAGS_##name = default_value; \ + namespace { \ + class Caffe2FlagParser_##name : public Caffe2FlagParser { \ + public: \ + explicit Caffe2FlagParser_##name(const string& content) { \ + success_ = Caffe2FlagParser::Parse(content, &FLAGS_##name); \ + } \ + }; \ + } \ + RegistererCaffe2FlagsRegistry g_Caffe2FlagsRegistry_##name( \ + #name, \ + Caffe2FlagsRegistry(), \ + RegistererCaffe2FlagsRegistry::DefaultCreator, \ + "(" #type ", default " #default_value ") " help_str); \ } #define CAFFE2_DEFINE_int(name, default_value, help_str) \ @@ -180,9 +180,9 @@ CAFFE_DECLARE_REGISTRY(Caffe2FlagsRegistry, Caffe2FlagParser, const string&); CAFFE2_DEFINE_typed_var(string, name, default_value, help_str) // DECLARE_typed_var should be used in header files and in the global namespace. 
-#define CAFFE2_DECLARE_typed_var(type, name) \ - namespace caffe2 { \ - CAFFE2_IMPORT extern type FLAGS_##name; \ +#define CAFFE2_DECLARE_typed_var(type, name) \ + namespace caffe2 { \ + C10_IMPORT extern type FLAGS_##name; \ } // namespace caffe2 #define CAFFE2_DECLARE_int(name) CAFFE2_DECLARE_typed_var(int, name) diff --git a/caffe2/core/hip/common_miopen.h b/caffe2/core/hip/common_miopen.h index 59fa0f429f8ac..ecdf376e47490 100644 --- a/caffe2/core/hip/common_miopen.h +++ b/caffe2/core/hip/common_miopen.h @@ -164,7 +164,7 @@ class miopenTensorDescWrapper miopenTensorDescriptor_t desc_; miopenDataType_t type_; vector dims_; - AT_DISABLE_COPY_AND_ASSIGN(miopenTensorDescWrapper); + C10_DISABLE_COPY_AND_ASSIGN(miopenTensorDescWrapper); }; } // namespace caffe2 diff --git a/caffe2/core/hip/miopen_wrapper.h b/caffe2/core/hip/miopen_wrapper.h index 910db8b79d788..328c7522258d9 100644 --- a/caffe2/core/hip/miopen_wrapper.h +++ b/caffe2/core/hip/miopen_wrapper.h @@ -92,7 +92,7 @@ class MIOPENState hipStream_t stream_{nullptr}; MIOPENWorkspace workspace_; size_t gpu_id_{0}; - AT_DISABLE_COPY_AND_ASSIGN(MIOPENState); + C10_DISABLE_COPY_AND_ASSIGN(MIOPENState); }; /** @@ -157,7 +157,7 @@ class MIOPENWrapper CAFFE2_COMPILE_TIME_MAX_HIP_GPUS>; static PerGPUMIOPENStates& miopen_states(); - AT_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper); + C10_DISABLE_COPY_AND_ASSIGN(MIOPENWrapper); }; }; // namespace caffe2 diff --git a/caffe2/core/hip/net_async_dag_hip.cc b/caffe2/core/hip/net_async_dag_hip.cc index fa35b2a8c2161..faac5b119f576 100644 --- a/caffe2/core/hip/net_async_dag_hip.cc +++ b/caffe2/core/hip/net_async_dag_hip.cc @@ -58,7 +58,7 @@ class ProfiledRange ProfiledRange(const OperatorDef& def, Color color) {} private: - AT_DISABLE_COPY_AND_ASSIGN(ProfiledRange); + C10_DISABLE_COPY_AND_ASSIGN(ProfiledRange); }; } // namespace diff --git a/caffe2/core/logging.h b/caffe2/core/logging.h index 37fcd939c4d61..288c34afd5dbe 100644 --- a/caffe2/core/logging.h +++ b/caffe2/core/logging.h @@ -8,6 +8,7 @@ #include #include +#include "caffe2/core/common.h" #include "caffe2/core/flags.h" // CAFFE2_LOG_THRESHOLD is a compile time flag that would allow us to turn off diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in index a055de0fdc254..188853296f816 100644 --- a/caffe2/core/macros.h.in +++ b/caffe2/core/macros.h.in @@ -11,7 +11,6 @@ #define CAFFE2_VERSION_MAJOR @CAFFE2_VERSION_MAJOR@ #define CAFFE2_VERSION_MINOR @CAFFE2_VERSION_MINOR@ #define CAFFE2_VERSION_PATCH @CAFFE2_VERSION_PATCH@ -#define CAFFE2_GIT_VERSION "@CAFFE2_GIT_VERSION@" static_assert( CAFFE2_VERSION_MINOR < 100, @@ -54,7 +53,6 @@ static_assert( // Useful build settings that are recorded in the compiled binary #define CAFFE2_BUILD_STRINGS { \ - {"GIT_VERSION", "${CAFFE2_GIT_VERSION}"}, \ {"CXX_FLAGS", "${CMAKE_CXX_FLAGS}"}, \ {"BUILD_TYPE", "${CMAKE_BUILD_TYPE}"}, \ {"BLAS", "${BLAS}"}, \ diff --git a/caffe2/core/net.h b/caffe2/core/net.h index 962363ad0270e..57fd53f1de4f1 100644 --- a/caffe2/core/net.h +++ b/caffe2/core/net.h @@ -124,7 +124,7 @@ class CAFFE2_API NetBase : public Observable { string name_; vector events_; std::shared_ptr net_def_; - AT_DISABLE_COPY_AND_ASSIGN(NetBase); + C10_DISABLE_COPY_AND_ASSIGN(NetBase); }; class CAFFE2_API ExecutorHelper { diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 7edec76c439a9..502233e7f045b 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -125,7 +125,7 @@ class CAFFE2_API AsyncNetBase : public NetBase { bool use_per_net_pools_; bool 
is_blocking_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncNetBase); + C10_DISABLE_COPY_AND_ASSIGN(AsyncNetBase); private: void storeExceptionPtr(); diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 225337d1452b9..550a760826edd 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -71,7 +71,7 @@ class ProfiledRange { private: nvtxRangeId_t range_ = 0; - AT_DISABLE_COPY_AND_ASSIGN(ProfiledRange); + C10_DISABLE_COPY_AND_ASSIGN(ProfiledRange); }; #else @@ -81,7 +81,7 @@ class ProfiledRange { ProfiledRange(const OperatorDef& def, Color color) {} private: - AT_DISABLE_COPY_AND_ASSIGN(ProfiledRange); + C10_DISABLE_COPY_AND_ASSIGN(ProfiledRange); }; #endif // ifdef CAFFE2_USE_NVTX diff --git a/caffe2/core/net_async_dag_gpu.h b/caffe2/core/net_async_dag_gpu.h index 62ae301e4cbf2..845e5160d27b9 100644 --- a/caffe2/core/net_async_dag_gpu.h +++ b/caffe2/core/net_async_dag_gpu.h @@ -32,7 +32,7 @@ class AsyncDAGNet : public DAGNetBase { int stream(const DeviceOption& device_option); static thread_local std::vector stream_counters_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncDAGNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncDAGNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_async_polling.h b/caffe2/core/net_async_polling.h index 8b3d6db8d695e..9c4a284f0d13a 100644 --- a/caffe2/core/net_async_polling.h +++ b/caffe2/core/net_async_polling.h @@ -40,7 +40,7 @@ class AsyncPollingNet : public AsyncNetBase { void reset() override; std::atomic has_chain_failed_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncPollingNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncPollingNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_async_scheduling.h b/caffe2/core/net_async_scheduling.h index 8576fca1bb07f..4fcdf4b731681 100644 --- a/caffe2/core/net_async_scheduling.h +++ b/caffe2/core/net_async_scheduling.h @@ -30,7 +30,7 @@ class CAFFE2_API AsyncSchedulingNet : public AsyncNetBase { std::atomic processed_tasks_num_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncSchedulingNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncSchedulingNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_dag.h b/caffe2/core/net_dag.h index 078fa63a4238a..ab3ce0f6f3fa1 100644 --- a/caffe2/core/net_dag.h +++ b/caffe2/core/net_dag.h @@ -84,7 +84,7 @@ class CAFFE2_API DAGNetBase : public NetBase { mutable std::vector stats_; std::unordered_map> task_timers_; - AT_DISABLE_COPY_AND_ASSIGN(DAGNetBase); + C10_DISABLE_COPY_AND_ASSIGN(DAGNetBase); }; class CAFFE2_API DAGNet : public DAGNetBase { diff --git a/caffe2/core/net_simple.h b/caffe2/core/net_simple.h index a8ac751dbb5ed..c114fd8d224f2 100644 --- a/caffe2/core/net_simple.h +++ b/caffe2/core/net_simple.h @@ -48,7 +48,7 @@ class CAFFE2_API SimpleNet : public NetBase { vector> operators_; - AT_DISABLE_COPY_AND_ASSIGN(SimpleNet); + C10_DISABLE_COPY_AND_ASSIGN(SimpleNet); }; } // namespace caffe2 diff --git a/caffe2/core/net_simple_async.h b/caffe2/core/net_simple_async.h index 38c3255bf4df3..ea5aae959870f 100644 --- a/caffe2/core/net_simple_async.h +++ b/caffe2/core/net_simple_async.h @@ -43,7 +43,7 @@ class AsyncSimpleNet : public NetBase { vector> operators_; - AT_DISABLE_COPY_AND_ASSIGN(AsyncSimpleNet); + C10_DISABLE_COPY_AND_ASSIGN(AsyncSimpleNet); }; } // namespace caffe2 diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index 2a03e428619b3..e7a889980365c 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ 
b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -259,7 +259,7 @@ template using enable_if_t = typename std::enable_if::type; template -struct CAFFE2_EXPORT inheritedFrom { +struct C10_EXPORT inheritedFrom { static constexpr bool value = std::is_base_of::value && !std::is_same::value; }; @@ -267,14 +267,15 @@ struct CAFFE2_EXPORT inheritedFrom { // This is just a way to fix issues when the isa<> implementation // can't automatically downcast. template -struct CAFFE2_EXPORT is_impl { +struct C10_EXPORT is_impl { inline static bool impl(N n) { return isa(n->data()); } }; template -struct CAFFE2_EXPORT is_impl::value>> { +struct C10_EXPORT + is_impl::value>> { inline static bool impl(N n) { if (!isa(n->data().get())) { return false; @@ -285,7 +286,8 @@ struct CAFFE2_EXPORT is_impl -struct CAFFE2_EXPORT is_impl::value>> { +struct C10_EXPORT + is_impl::value>> { inline static bool impl(N n) { if (!isa(n->data().get())) { return false; @@ -303,14 +305,15 @@ inline bool is(N n) { // This is just a way to fix issues when the dyn_cast<> implementation // can't automatically downcast. template -struct CAFFE2_EXPORT get_impl { +struct C10_EXPORT get_impl { inline static T* impl(N n) { return dyn_cast(n->data().get()); } }; template -struct CAFFE2_EXPORT get_impl::value>> { +struct C10_EXPORT + get_impl::value>> { inline static T* impl(N n) { if (!is(n)) { assert(0 && "Cannot get type from node"); @@ -322,7 +325,8 @@ struct CAFFE2_EXPORT get_impl -struct CAFFE2_EXPORT get_impl::value>> { +struct C10_EXPORT + get_impl::value>> { inline static T* impl(N n) { if (!is(n)) { assert(0 && "Cannot get type from node"); @@ -422,7 +426,7 @@ CAFFE2_API std::vector getOutputs(NNGraph::NodeRef n); CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m); template -struct CAFFE2_EXPORT NodeHelper {}; +struct C10_EXPORT NodeHelper {}; struct NNNodeMatchCriteria { std::function predicate; diff --git a/caffe2/core/observer.h b/caffe2/core/observer.h index e10ab0bb7eac6..378a7569d37bb 100644 --- a/caffe2/core/observer.h +++ b/caffe2/core/observer.h @@ -51,7 +51,7 @@ class Observable { virtual ~Observable() = default; - AT_DISABLE_COPY_AND_ASSIGN(Observable); + C10_DISABLE_COPY_AND_ASSIGN(Observable); using Observer = ObserverBase; diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 25aa801d265db..1a968c4c3755f 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -122,7 +122,7 @@ class CAFFE2_API OperatorBase : public Observable { static_assert( std::is_same::value, "Output(int, DeviceType) is only available for Tensor"); - return outputs_.at(idx)->GetMutableTensor(type); + return BlobGetMutableTensor(outputs_.at(idx), type); } template @@ -149,7 +149,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool InputIsTensorType(int idx, DeviceType device_type) { - return inputs_.at(idx)->IsTensorType(device_type); + return BlobIsTensorType(*inputs_.at(idx), device_type); } template @@ -162,7 +162,7 @@ class CAFFE2_API OperatorBase : public Observable { } inline bool OutputIsTensorType(int idx, DeviceType type) { - return outputs_.at(idx)->IsTensorType(type); + return BlobIsTensorType(*outputs_.at(idx), type); } inline int InputSize() const { @@ -397,7 +397,7 @@ class CAFFE2_API OperatorBase : public Observable { // An event used by asynchronous execution. 
std::unique_ptr event_; - AT_DISABLE_COPY_AND_ASSIGN(OperatorBase); + C10_DISABLE_COPY_AND_ASSIGN(OperatorBase); }; // If your operator does not need any specialized contructor or destructor, @@ -825,7 +825,7 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_CPU_OPERATOR_CREATOR(key, ...) \ CAFFE_REGISTER_CREATOR(CPUOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CPU_OPERATOR(name, ...) \ - CAFFE2_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();\ + C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CPU##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ @@ -844,7 +844,7 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_CUDA_OPERATOR_CREATOR(key, ...) \ CAFFE_REGISTER_CREATOR(CUDAOperatorRegistry, key, __VA_ARGS__) #define REGISTER_CUDA_OPERATOR(name, ...) \ - CAFFE2_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_CUDA##name() { \ CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ } \ @@ -869,10 +869,10 @@ CAFFE_DECLARE_REGISTRY( #define REGISTER_HIP_OPERATOR_CREATOR(key, ...) \ CAFFE_REGISTER_CREATOR(HIPOperatorRegistry, key, __VA_ARGS__) #define REGISTER_HIP_OPERATOR(name, ...) \ - CAFFE2_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + C10_IMPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_HIP##name() { \ - CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ - } \ + CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \ + } \ CAFFE_REGISTER_CLASS(HIPOperatorRegistry, name, __VA_ARGS__) #define REGISTER_HIP_OPERATOR_STR(str_name, ...) \ CAFFE_REGISTER_TYPED_CLASS(HIPOperatorRegistry, str_name, __VA_ARGS__) diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc index a76a0df9bd004..3082810b85cde 100644 --- a/caffe2/core/operator_schema.cc +++ b/caffe2/core/operator_schema.cc @@ -415,7 +415,7 @@ std::vector OpSchema::SupplyDenseFillers( return fillers; } -CAFFE2_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { +C10_EXPORT std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { if (!schema.args().empty()) { out << "Arguments:" << std::endl; for (const auto& arg : schema.args()) { diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index e0b6495647ebd..54a6a17b8a0d2 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -576,16 +576,16 @@ OpSchema::Cost PointwiseCostInference( #ifndef CAFFE2_NO_OPERATOR_SCHEMA -#define OPERATOR_SCHEMA(name) \ - CAFFE2_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ +#define OPERATOR_SCHEMA(name) \ + C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ + static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #else // CAFFE2_NO_OPERATOR_SCHEMA -#define OPERATOR_SCHEMA(name) \ - CAFFE2_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ - static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ +#define OPERATOR_SCHEMA(name) \ + C10_EXPORT void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(){}; \ + static OpSchema* CAFFE_ANONYMOUS_VARIABLE(name) CAFFE2_UNUSED = \ 1 ? 
nullptr : &OpSchemaRegistry::NewSchema(#name, __FILE__, __LINE__) #endif // CAFFE2_NO_OPERATOR_SCHEMA diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 2c0ad9e7a8127..8e48b6b7beabc 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -131,7 +131,8 @@ struct WorkspaceIdInjector { "Integer overflow while calculating GLOBAL_WORKSPACE_ID blob"); int32_t global_ws_id = (seq_++) + (static_cast(node_id) << 16); Blob* global_ws_id_blob = workspace->CreateLocalBlob(GLOBAL_WORKSPACE_ID); - TensorCPU* global_ws_id_tensor = global_ws_id_blob->GetMutableTensor(CPU); + TensorCPU* global_ws_id_tensor = + BlobGetMutableTensor(global_ws_id_blob, CPU); global_ws_id_tensor->Resize(); global_ws_id_tensor->template mutable_data()[0] = global_ws_id; VLOG(1) << "Adding " << GLOBAL_WORKSPACE_ID << " = " << global_ws_id; diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index f277ffdbdd0a6..385ebf1d5f9f8 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -14,7 +14,7 @@ namespace caffe2 { template -class CAFFE2_EXPORT QTensor { +class C10_EXPORT QTensor { public: QTensor() {} virtual ~QTensor() {} diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index 7db975077ea8b..f026795b23c3e 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -100,7 +100,7 @@ class Registry { CaffeMap help_message_; std::mutex register_mutex_; - AT_DISABLE_COPY_AND_ASSIGN(Registry); + C10_DISABLE_COPY_AND_ASSIGN(Registry); }; template @@ -142,16 +142,16 @@ class Registerer { * declaration, as well as creating a convenient typename for its corresponding * registerer. */ -#define CAFFE_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - CAFFE2_EXPORT Registry, ##__VA_ARGS__>* \ - RegistryName(); \ - typedef Registerer, ##__VA_ARGS__> \ +#define CAFFE_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_EXPORT Registry, ##__VA_ARGS__>* \ + RegistryName(); \ + typedef Registerer, ##__VA_ARGS__> \ Registerer##RegistryName; #define CAFFE_DEFINE_TYPED_REGISTRY( \ RegistryName, SrcType, ObjectType, PtrType, ...) 
\ - CAFFE2_EXPORT Registry, ##__VA_ARGS__>* \ + C10_EXPORT Registry, ##__VA_ARGS__>* \ RegistryName() { \ static Registry, ##__VA_ARGS__>* registry = \ new Registry, ##__VA_ARGS__>(); \ diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index e142e1a6b6a90..caa0ba9ea55f4 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -86,7 +86,7 @@ vector GetTensorInfo( CHECK(tc); CHECK(tc->unsafeGetTensorImpl()); CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()); - *capacity = tc->capacity_nbytes(); + *capacity = tc->storage().capacity(); tc->ExtractDeviceOption(device); return tc->dims(); } diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 286718d4268ca..1e4cac2788b56 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -262,10 +262,6 @@ class CAFFE2_API Tensor final { return impl_.get()->nbytes(); } - inline size_t capacity_nbytes() const { - return impl_.get()->capacity_nbytes(); - } - inline const vector& dims() const { return impl_.get()->dims(); } @@ -322,6 +318,10 @@ class CAFFE2_API Tensor final { const Storage& storage() { return impl_->storage(); } + + const Storage& storage() const { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 20c398f7e4c82..53c812f55e297 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -693,11 +693,6 @@ class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { ; } - // NB: This capacity may also include available space - // in the storage BEFORE the tensor data, if storage_offset != 0 - inline size_t capacity_nbytes() const { - return storage_.capacity(); - } /** * Returns the dimensions of the tensor as a vector. */ diff --git a/caffe2/core/timer.h b/caffe2/core/timer.h index a290ffc4aadc1..a0384b0dbdbd0 100644 --- a/caffe2/core/timer.h +++ b/caffe2/core/timer.h @@ -41,7 +41,7 @@ class Timer { protected: std::chrono::time_point start_time_; - AT_DISABLE_COPY_AND_ASSIGN(Timer); + C10_DISABLE_COPY_AND_ASSIGN(Timer); }; } diff --git a/caffe2/core/workspace.h b/caffe2/core/workspace.h index 11bf9c413c596..2ad486c328f56 100644 --- a/caffe2/core/workspace.h +++ b/caffe2/core/workspace.h @@ -151,7 +151,7 @@ class CAFFE2_API Workspace { auto* to_blob = CreateBlob(blob); CAFFE_ENFORCE(to_blob); const auto& from_tensor = from_blob->template Get(); - auto* to_tensor = to_blob->GetMutableTensor(Context::GetDeviceType()); + auto* to_tensor = BlobGetMutableTensor(to_blob, Context::GetDeviceType()); to_tensor->CopyFrom(from_tensor); } } @@ -328,7 +328,7 @@ class CAFFE2_API Workspace { std::mutex thread_pool_creation_mutex_; std::shared_ptr bookkeeper_; - AT_DISABLE_COPY_AND_ASSIGN(Workspace); + C10_DISABLE_COPY_AND_ASSIGN(Workspace); }; } // namespace caffe2 diff --git a/caffe2/db/create_db_op.h b/caffe2/db/create_db_op.h index ac7c137cea9aa..6a964f86d1b43 100644 --- a/caffe2/db/create_db_op.h +++ b/caffe2/db/create_db_op.h @@ -34,7 +34,7 @@ class CreateDBOp final : public Operator { string db_name_; uint32_t num_shards_; uint32_t shard_id_; - AT_DISABLE_COPY_AND_ASSIGN(CreateDBOp); + C10_DISABLE_COPY_AND_ASSIGN(CreateDBOp); }; } // namespace caffe2 diff --git a/caffe2/db/leveldb.cc b/caffe2/db/leveldb.cc index 23a188027ece7..fe2665f3a6f0e 100644 --- a/caffe2/db/leveldb.cc +++ b/caffe2/db/leveldb.cc @@ -51,7 +51,7 @@ class LevelDBTransaction : public Transaction { leveldb::DB* db_; std::unique_ptr batch_; - AT_DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(LevelDBTransaction); 
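  // C10_DISABLE_COPY_AND_ASSIGN, like the AT_DISABLE_COPY_AND_ASSIGN macro it
  // replaces throughout this patch, deletes the copy operations of the class it
  // appears in; for this class that amounts to roughly:
  //   LevelDBTransaction(const LevelDBTransaction&) = delete;
  //   LevelDBTransaction& operator=(const LevelDBTransaction&) = delete;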
}; class LevelDB : public DB { diff --git a/caffe2/db/lmdb.cc b/caffe2/db/lmdb.cc index 2eb65bb7aa738..a2eee9910655a 100644 --- a/caffe2/db/lmdb.cc +++ b/caffe2/db/lmdb.cc @@ -114,7 +114,7 @@ class LMDBTransaction final : public Transaction { MDB_dbi mdb_dbi_; MDB_txn* mdb_txn_; - AT_DISABLE_COPY_AND_ASSIGN(LMDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(LMDBTransaction); }; class LMDB : public DB { diff --git a/caffe2/db/protodb.cc b/caffe2/db/protodb.cc index 2473ad23b6c45..fdaaaf57f1716 100644 --- a/caffe2/db/protodb.cc +++ b/caffe2/db/protodb.cc @@ -60,7 +60,7 @@ class ProtoDBTransaction : public Transaction { TensorProtos* proto_; std::unordered_set existing_names_; - AT_DISABLE_COPY_AND_ASSIGN(ProtoDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(ProtoDBTransaction); }; class ProtoDB : public DB { diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 8d011cd3be8bf..38ffdc9942645 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,8 +33,9 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), - "Expect cpu tensor if not itensor"); + CAFFE_ENFORCE( + BlobIsTensorType(OperatorBase::InputBlob(i), CPU), + "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || tensor_cpu.size_from_dim(0) == 0, diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 08e6de2ae3f0d..3226a08c4af9c 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -89,7 +89,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { local_input_blobs_[i]->Reset(); } input_share_[i] = false; - auto dtensor = local_input_blobs_[i]->GetMutableTensor(CPU); + auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU); dtensor->Resize(input.get_dims()); if (input.is_public_format()) { dtensor->ShareExternalPointer( @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); @@ -153,7 +153,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor"; Blob* dst = OperatorBase::OutputBlob(i); dst->Reset(new Tensor(CPU)); - auto dtensor = dst->GetMutableTensor(CPU); + auto dtensor = BlobGetMutableTensor(dst, CPU); dtensor->Resize(src_dims); dtensor->ShareData(src); } diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 626568a989b93..468a42df1a923 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.IsTensorType(CPU)) { + if (BlobIsTensorType(input_blob, CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h 
b/caffe2/mkl/operators/operator_fallback_mkl.h index 6d9713b74612d..a3135758813ec 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -66,10 +66,10 @@ class MKLFallbackOp final : public Operator { for (int i = 0; i < InputSize(); ++i) { if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - local_input_blobs_[i]->GetMutableTensor(CPU)); + BlobGetMutableTensor(local_input_blobs_[i], CPU)); } else if (OperatorBase::InputIsType>(i)) { OperatorBase::Input>(i).CopyTo( - local_input_blobs_[i]->GetMutableTensor(CPU)); + BlobGetMutableTensor(local_input_blobs_[i], CPU)); } else { VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy."; // Note(jiayq): This removes a const but conceptually @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mkl/utils/mkl_memory.h b/caffe2/mkl/utils/mkl_memory.h index bd0ad40422079..736d8ede8cf53 100644 --- a/caffe2/mkl/utils/mkl_memory.h +++ b/caffe2/mkl/utils/mkl_memory.h @@ -58,7 +58,7 @@ class PrimitiveWrapper { private: dnnPrimitive_t primitive_ = 0; - AT_DISABLE_COPY_AND_ASSIGN(PrimitiveWrapper); + C10_DISABLE_COPY_AND_ASSIGN(PrimitiveWrapper); }; template @@ -138,7 +138,7 @@ class LayoutWrapper { private: dnnLayout_t layout_ = 0; - AT_DISABLE_COPY_AND_ASSIGN(LayoutWrapper); + C10_DISABLE_COPY_AND_ASSIGN(LayoutWrapper); }; /** @@ -557,7 +557,7 @@ class MKLMemory { // The primitive to use to convert from internal layout to user layout PrimitiveWrapper convert_out_; - AT_DISABLE_COPY_AND_ASSIGN(MKLMemory); + C10_DISABLE_COPY_AND_ASSIGN(MKLMemory); }; template @@ -575,7 +575,7 @@ class MKLWorkspace { private: void* buffer_; - AT_DISABLE_COPY_AND_ASSIGN(MKLWorkspace); + C10_DISABLE_COPY_AND_ASSIGN(MKLWorkspace); }; } // namespace mkl diff --git a/caffe2/mobile/contrib/arm-compute/core/net_gl.h b/caffe2/mobile/contrib/arm-compute/core/net_gl.h index 029d888b1ebf9..1dc93dedc3fff 100644 --- a/caffe2/mobile/contrib/arm-compute/core/net_gl.h +++ b/caffe2/mobile/contrib/arm-compute/core/net_gl.h @@ -57,7 +57,7 @@ class GLNet : public NetBase { vector> operators_; - AT_DISABLE_COPY_AND_ASSIGN(GLNet); + C10_DISABLE_COPY_AND_ASSIGN(GLNet); }; } // namespace caffe2 diff --git a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc index 111af03f8602b..06ec2b50acc17 100644 --- a/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc +++ b/caffe2/mobile/contrib/arm-compute/operators/copy_op.cc @@ -43,7 +43,7 @@ bool CopyFromGLOp::RunOnDevice() { if (first_run_) { first_run_ = false; for (int i = 0; i < Inputs().size(); ++i) { - auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); + auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); Y->Resize(inputs_[i]->dims()); Y->template mutable_data(); } @@ -54,7 +54,7 @@ bool CopyFromGLOp::RunOnDevice() { // GLTensor auto* X = inputs_[i].get(); X->lazy_allocate(Xblob, second_run_, true); - auto* Y = OperatorBase::Outputs()[i]->GetMutableTensor(CPU); + auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU); Timer timer; timer.Start(); getTensorCPU(*X, *Y); diff --git a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h 
b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h index daa7ef008fc7b..68f79e84a89f8 100644 --- a/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h +++ b/caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h @@ -27,7 +27,7 @@ template void PopulateCPUBlob(Workspace *ws, bool random, std::string name, std::vector dims, int val = 1, int dist_shift = 0, float variance = 1) { Blob *blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(dims); T *t_data = tensor->mutable_data(); std::random_device rd; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 52f746f63f317..742f8e48f4e9e 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,13 +489,13 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsTensorType(CPU) || + if (!BlobIsTensorType(*noiseBlob, CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; // Initialize random noise on first use. // Cache it to maintain temporal consistency. - auto* t = noiseBlob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(noiseBlob, CPU); t->Resize(noiseSize); math::RandGaussian( t->size(), diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 7216b16611aa2..7ac629019c58c 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("stddev"), CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. 
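The mpscnn_test.mm hunks above and below repeat one setup idiom: create a named blob in a Workspace, obtain its CPU tensor through the new free function, resize it, and fill it with random values. A condensed sketch of that idiom, assuming the Caffe2 headers this test already uses (the helper name, blob name, and shape are illustrative):

#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/math.h"

namespace caffe2 {
void CreateRandomCPUInput(Workspace* ws) {
  // Formerly ws->CreateBlob("X_cpu")->GetMutableTensor(CPU).
  auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
  t->Resize(1, 12, 57, 72);
  CPUContext ctx;
  math::RandGaussian<float, CPUContext>(
      t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
} // namespace caffe2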
@@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("pw"), CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(shared ? channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(name), CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. 
@@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -682,8 +682,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPool Test: " << pool; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,7 +1072,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = + BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1080,7 +1081,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1092,7 +1093,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; 
math::RandGaussian( @@ -1188,7 +1189,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1196,7 +1197,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1204,7 +1205,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1275,7 +1276,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1283,7 +1284,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1291,7 +1292,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1385,7 +1386,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1393,7 +1394,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1401,7 +1402,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1493,7 +1494,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1501,7 +1502,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1509,7 +1510,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1607,7 +1608,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1615,7 +1616,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1623,7 +1624,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = 
BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1726,7 +1727,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1734,7 +1735,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1791,7 +1792,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1799,7 +1800,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1856,7 +1857,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1864,7 +1865,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1921,7 +1922,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X0_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1929,7 +1930,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X1_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2011,7 +2012,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2065,7 +2066,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2136,7 +2137,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2144,7 +2145,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2250,14 +2251,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( t->size(), 4, 2, t->mutable_data(), 
&ctx); } { - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("R"), CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2362,7 +2363,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2497,7 +2498,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2505,7 +2506,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("bbox_delta_cpu"), CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2513,7 +2514,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("im_info"), CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2521,7 +2522,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("anchors"), CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2587,7 +2588,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2661,8 +2662,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2675,7 +2676,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutableTensor(CPU); + BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize( inputChannels, outputChannels, @@ -2692,7 +2693,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutableTensor(CPU); + BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2809,7 +2810,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2891,7 +2892,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2964,7 +2965,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob(cpu(i)), CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3336,8 +3337,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace cws; cws.RunNetOnce(initNet); { - auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + cws.CreateBlob(predictNet.external_input(0)), CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3348,8 +3349,8 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { Workspace mws; mws.RunNetOnce(initNet); { - auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + mws.CreateBlob(predictNet.external_input(0)), CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3397,16 +3398,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = BlobGetMutableTensor( \ + ws.CreateBlob(predictNet.external_input(0)), CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/mobile/contrib/ios/pool_test.cc b/caffe2/mobile/contrib/ios/pool_test.cc index 47fd405eef01e..3f78c5d1fcd6a 100644 --- a/caffe2/mobile/contrib/ios/pool_test.cc +++ b/caffe2/mobile/contrib/ios/pool_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ios/resize_test.cc b/caffe2/mobile/contrib/ios/resize_test.cc index 1c08df0f32a1c..428c395fe442d 100644 --- a/caffe2/mobile/contrib/ios/resize_test.cc +++ b/caffe2/mobile/contrib/ios/resize_test.cc @@ -16,7 +16,7 @@ void AddNoiseInput(const vector& shape, const string& name, Workspace* DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/nnapi/nnapi.cc b/caffe2/mobile/contrib/nnapi/nnapi.cc index 45ea26c44cc96..56f1fc28986a7 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi.cc @@ -679,7 +679,7 @@ void NNApi::init(const TensorVector& inputs, TensorVector* outputs) { 
output_dims.push_back(dim); } - auto* tensor = ws_.CreateBlob(blob)->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(ws_.CreateBlob(blob), CPU); tensor->Resize(output_dims); outputs->push_back(tensor); diff --git a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc index 359e7767746b6..c14e9ed26376e 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + #include "caffe2/core/init.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" @@ -43,14 +43,14 @@ static double benchmark_conv_caffe2( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group == 1) { t->Resize(K, C, kernel, kernel); } else { @@ -61,7 +61,7 @@ static double benchmark_conv_caffe2( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -129,14 +129,14 @@ static double benchmark_conv_nnapi( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -148,7 +148,7 @@ static double benchmark_conv_nnapi( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -190,7 +190,7 @@ static double benchmark_conv_nnapi( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < warmup; i++) { @@ -220,14 +220,14 @@ static double benchmark_conv_nnapi_int8( ws = &localWs; } { - auto* t = ws->CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; } } { - auto* t = ws->CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU); if (group > 1) { CAFFE_ENFORCE_EQ(C, group); t->Resize(1, kernel, kernel, C); @@ -243,7 +243,7 @@ static double benchmark_conv_nnapi_int8( // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and // bias_scale == input_scale * filter_scale. 
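Editorial note: the comment just above states the NNAPI constraint on a quantized conv bias. As a quick worked example with invented numbers: if the input tensor is quantized with scale 0.5 and the filter with scale 0.02, the bias must be an int32 tensor with zeroPoint 0 and scale 0.5 × 0.02 = 0.01, so a real-valued bias of 1.23 would be stored as round(1.23 / 0.01) = 123.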
{ - auto* t = ws->CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU); t->Resize(K); for (int i = 0; i < t->size(); i++) { t->mutable_data()[i] = rand() % 10; @@ -322,7 +322,7 @@ static double benchmark_conv_nnapi_int8( NetDef initNet; NNApi model(initNet, netdef, ws); std::vector inputs, outputs; - inputs.push_back(ws->GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU)); CAFFE_ENFORCE(model.run(inputs, &outputs)); for (int i = 0; i < warmup; i++) { diff --git a/caffe2/mobile/contrib/nnapi/nnapi_test.cc b/caffe2/mobile/contrib/nnapi/nnapi_test.cc index deab1ca7b43f7..9b4608dc07aee 100644 --- a/caffe2/mobile/contrib/nnapi/nnapi_test.cc +++ b/caffe2/mobile/contrib/nnapi/nnapi_test.cc @@ -55,7 +55,7 @@ static void test_relu(int N, int C, int H, int W) { // CPU reference Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -81,7 +81,7 @@ static void test_relu(int N, int C, int H, int W) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -103,21 +103,21 @@ static void test_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, kernel, kernel, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(K); CPUContext ctx; math::RandGaussian( @@ -189,7 +189,7 @@ static void test_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -211,21 +211,21 @@ static void test_depthwise_conv_NHWC( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(1, kernel, kernel, D); CPUContext ctx; math::RandGaussian( t->size(), 0, 30, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(D); CPUContext ctx; math::RandGaussian( @@ -406,7 +406,7 @@ static void test_depthwise_conv_NHWC( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -428,7 +428,7 
@@ static void test_pooling( int stride_w) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; math::RandGaussian( @@ -496,7 +496,7 @@ static void test_pooling( NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; @@ -506,7 +506,7 @@ static void test_pooling( static void test_softmax(int N, int C, int H = 1, int W = 1) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); if (H == 1 && W == 1) { t->Resize(N, C); } else { @@ -538,7 +538,7 @@ static void test_softmax(int N, int C, int H = 1, int W = 1) { NetDef initNet; NNApi model(initNet, netdef, &ws); std::vector inputs, outputs; - inputs.push_back(ws.GetBlob("X_cpu")->GetMutableTensor(CPU)); + inputs.push_back(BlobGetMutableTensor(ws.GetBlob("X_cpu"), CPU)); EXPECT_TRUE(model.run(inputs, &outputs)); const auto& t_nn = *outputs[0]; diff --git a/caffe2/mobile/contrib/opengl/test/opengl_test.cc b/caffe2/mobile/contrib/opengl/test/opengl_test.cc index 49a875184c10d..690a33cb854f1 100644 --- a/caffe2/mobile/contrib/opengl/test/opengl_test.cc +++ b/caffe2/mobile/contrib/opengl/test/opengl_test.cc @@ -178,7 +178,7 @@ void testOpenGLCopyOps(int N, int C, int H, int W, float error, int tile_x = 1, LOG(INFO) << "OPENGLCopyFrom/To Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -275,7 +275,7 @@ void testOpenGLConv(int N, << " Op: " << glPoolOperationName[poolOp]; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -301,7 +301,7 @@ void testOpenGLConv(int N, } if (poolOp != AveragePool && poolOp != MaxPool) { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); if (poolOp != ConvTranspose && poolOp != ConvTransposePRelu && poolOp != ConvTransposeRelu) { t->Resize(K, C, kernel_h, kernel_w); } else { @@ -343,7 +343,7 @@ void testOpenGLConv(int N, // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -367,7 +367,7 @@ void testOpenGLConv(int N, } if (poolOp == ConvPRelu || poolOp == ConvTransposePRelu) { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -532,7 +532,7 @@ void testOpenGLPRelu( << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
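Editorial note: several of the mobile test files touched here (pool_test.cc, resize_test.cc, and the SNPE benchmark further down) factor the recurring fill-a-blob-with-noise setup into an AddNoiseInput helper, which this patch also ports to the free-function accessor. A reconstruction of its shape follows; the template arguments and Gaussian parameters were lost in this extract, so the ones shown are assumptions to be checked against the sources.

#include "caffe2/core/context.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

// Reconstructed sketch of the AddNoiseInput test helper after this patch.
void AddNoiseInput(
    const std::vector<int64_t>& shape,
    const std::string& name,
    Workspace* ws) {
  DeviceOption option;
  CPUContext context(option);
  Blob* blob = ws->CreateBlob(name);
  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(shape);
  math::RandGaussian<float, CPUContext>(
      tensor->size(), 0.0f, 1.0f, tensor->mutable_data<float>(), &context);
}

} // namespace caffe2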
@@ -541,7 +541,7 @@ void testOpenGLPRelu( // prelu scale { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(prelu_size); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -603,7 +603,7 @@ void testOpenGLRelu(int N, int C, int H, int W, int input_tile_x, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -664,13 +664,13 @@ void testOpenGLAdd(int N, int C, int H, int W, float error = 0.1, int input_tile << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); + auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); + auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -750,13 +750,13 @@ void testOpenGLSub(int N, int C, int H, int W, float error = 0.1) { Workspace ws; { - auto* t0 = ws.CreateBlob("X_cpu0")->GetMutableTensor(CPU); + auto* t0 = BlobGetMutableTensor(ws.CreateBlob("X_cpu0"), CPU); t0->Resize(N, C, H, W); CPUContext ctx0; // Too noisy. math::RandGaussian(t0->size(), 0, 30, t0->mutable_data(), &ctx0); - auto* t1 = ws.CreateBlob("X_cpu1")->GetMutableTensor(CPU); + auto* t1 = BlobGetMutableTensor(ws.CreateBlob("X_cpu1"), CPU); t1->Resize(N, C, H, W); CPUContext ctx1; // Too noisy. @@ -814,8 +814,8 @@ void testOpenGLConcat(int N, std::vector Cs, int H, int W, bool tiling = fa << "H: " << H << ", W: " << W; Workspace ws; for (int i = 0; i < Cs.size(); i++) { - auto* t = - ws.CreateBlob("X_cpu" + caffe2::to_string(i))->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor( + ws.CreateBlob("X_cpu" + caffe2::to_string(i)), CPU); t->Resize(N, Cs[i], H, W); CPUContext ctx0; // Too noisy. @@ -891,7 +891,7 @@ void testOpenGLSigmoid(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
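Editorial note: the element-wise OpenGL tests in this area (PRelu, Relu, Add, Sub, Concat, Sigmoid) share one structure: fill CPU input blobs, compute a CPU reference, run the OpenGL net, and compare within the given error tolerance. A compressed sketch of the reference half only, with placeholder blob and op names that are not taken from the patch:

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

namespace caffe2 {

// Placeholder names throughout; the real tests also build and run the
// OpenGL net and then compare element-wise against this reference.
void RunAddReference(Workspace& ws) {
  OperatorDef def;
  def.set_type("Add");
  def.add_input("X_cpu0");
  def.add_input("X_cpu1");
  def.add_output("Y_ref");
  CAFFE_ENFORCE(ws.RunOperatorOnce(def));
  const auto& y_ref = ws.GetBlob("Y_ref")->Get<TensorCPU>();
  LOG(INFO) << "reference elements: " << y_ref.size();
}

} // namespace caffe2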
@@ -942,7 +942,7 @@ void testOpenGLTanh(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 2, t->mutable_data(), &ctx); @@ -992,14 +992,14 @@ void testOpenGLMul(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("B")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("B"), CPU); t->Resize(1); CPUContext ctx; math::RandGaussian(t->size(), -10, 10, t->mutable_data(), &ctx); @@ -1060,7 +1060,7 @@ void testOpenGLSoftmax(int N, int D, float error, bool tiled = false) { LOG(INFO) << "OpenGL Softmax Test " << "N: " << N << " D: " << D << " Tiled:" << tiled; Workspace ws; - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); { t->Resize(N, D); CPUContext ctx; @@ -1151,7 +1151,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. @@ -1163,7 +1163,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { // scale { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1172,7 +1172,7 @@ void testOpenGLInstanceNorm(int N, int C, int H, int W, float error) { } // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1254,7 +1254,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { << "C: " << C << ", H: " << H << ", W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; // Too noisy. 
@@ -1266,7 +1266,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { // scale { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1275,7 +1275,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // bias { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1284,7 +1284,7 @@ void testOpenGLInstanceNormPRelu(int N, int C, int H, int W, float error) { } // prelu scale { - auto* t = ws.CreateBlob("p")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("p"), CPU); t->Resize(C); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1385,7 +1385,7 @@ void OpenGL_speedtest(int N, << " C: " << C << " H: " << H << " W: " << W; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1399,7 +1399,7 @@ void OpenGL_speedtest(int N, } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1413,7 +1413,7 @@ void OpenGL_speedtest(int N, } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1479,7 +1479,7 @@ void testOpenGLPadImage( { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1593,7 +1593,7 @@ void testOpenGLResize(int N, { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); @@ -1675,7 +1675,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGL Preprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, H, W, C); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1684,7 +1684,7 @@ void testOpenGLPreprocess(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -1748,7 +1748,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { LOG(INFO) << "OpenGLDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1757,7 +1757,7 @@ void testOpenGLDeprocess(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1800,7 +1800,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, 
float error) { LOG(INFO) << "OpenGLNormPlanarYUV Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, 3, H, W); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1809,7 +1809,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("mean"), CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1818,7 +1818,7 @@ void testOpenGLNormPlanarYUV(int N, int C, int H, int W, float error) { } { - auto* t = ws.CreateBlob("stdev")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("stdev"), CPU); t->Resize(1, 3); CPUContext ctx; t->mutable_data()[0] = 6; @@ -1879,7 +1879,7 @@ void OpenGL_copyops_speedtest(int N, LOG(INFO) << "OpenGL CopyOps Speed Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("X_cpu"), CPU); t->Resize(N, C, H, W); CPUContext ctx; if (random_input) { @@ -1893,7 +1893,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); t->Resize(K, C, kernel_h, kernel_w); CPUContext ctx; if (random_input) { @@ -1907,7 +1907,7 @@ void OpenGL_copyops_speedtest(int N, } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); t->Resize(K); CPUContext ctx; if (random_input) { @@ -1990,8 +1990,8 @@ void compareModelsForOpenGL(std::string name, Workspace cws; cws.RunNetOnce(initNet); - auto* t_cpu = cws.CreateBlob(truncatedPredictNet.external_input(0)) - ->GetMutableTensor(CPU); + auto* t_cpu = BlobGetMutableTensor( + cws.CreateBlob(truncatedPredictNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2032,26 +2032,26 @@ void compareModelsForOpenGL(std::string name, Workspace mws; mws.RunNetOnce(initNet); - auto* t_gl = mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)) - ->GetMutableTensor(CPU); + auto* t_gl = BlobGetMutableTensor( + mws.CreateBlob(truncatedOpenGLPredictNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); t_gl->Resize(1, height, width, channel); uint8_t* input = t_gl->mutable_data(); - memcpy(input, t_cpu->mutable_data(), t_cpu->capacity_nbytes()); + memcpy(input, t_cpu->mutable_data(), t_cpu->storage().capacity()); } else if (name == "segmentation") { CAFFE_ENFORCE_EQ(input_order, "NCHW"); CAFFE_ENFORCE_EQ(input_type, "float"); t_gl->Resize(1, channel, height, width); float* input = t_gl->mutable_data(); - memcpy(input, t_cpu->mutable_data(), t_cpu->capacity_nbytes()); + memcpy(input, t_cpu->mutable_data(), t_cpu->storage().capacity()); } else if (name == "denoiser") { CAFFE_ENFORCE_EQ(input_order, "NCHW"); CAFFE_ENFORCE_EQ(input_type, "float"); t_gl->Resize(1, channel, height, width); float* input = t_gl->mutable_data(); - memcpy(input, t_cpu->mutable_data(), t_cpu->capacity_nbytes()); + memcpy(input, t_cpu->mutable_data(), t_cpu->storage().capacity()); } cws.RunNetOnce(truncatedPredictNet); @@ -2116,7 +2116,7 @@ void compareBatchedToTiledModels(std::string name, tws.RunNetOnce(initNet); auto* t_batch = - tws.CreateBlob(bachedNet.external_input(0))->GetMutableTensor(CPU); + 
BlobGetMutableTensor(tws.CreateBlob(bachedNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); @@ -2143,20 +2143,20 @@ void compareBatchedToTiledModels(std::string name, bws.RunNetOnce(initNet); auto* t_tiling = - bws.CreateBlob(tiledNet.external_input(0))->GetMutableTensor(CPU); + BlobGetMutableTensor(bws.CreateBlob(tiledNet.external_input(0)), CPU); if (name == "styleTransfer") { CAFFE_ENFORCE_EQ(input_order, "NHWC"); CAFFE_ENFORCE_EQ(input_type, "uint8_t"); t_tiling->Resize(1, height, width, channel); uint8_t* input = t_tiling->mutable_data(); - memcpy(input, t_batch->mutable_data(), t_batch->capacity_nbytes()); + memcpy(input, t_batch->mutable_data(), t_batch->storage().capacity()); } else if (name == "segmentation") { CAFFE_ENFORCE_EQ(input_order, "NCHW"); CAFFE_ENFORCE_EQ(input_type, "float"); t_tiling->Resize(1, channel, height, width); float* input = t_tiling->mutable_data(); - memcpy(input, t_batch->mutable_data(), t_batch->capacity_nbytes()); + memcpy(input, t_batch->mutable_data(), t_batch->storage().capacity()); } bws.RunNetOnce(bachedNet); diff --git a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc index deced71964496..cfeed00e8b973 100644 --- a/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc +++ b/caffe2/mobile/contrib/snpe/snpe_op_benchmark.cc @@ -14,7 +14,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* _tensor = _blob->GetMutableTensor(CPU); \ + auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ _tensor->Resize((_s)); \ memcpy(_tensor->mutable_data(), data_##_l, _tensor->nbytes()); \ } while (0) @@ -23,7 +23,7 @@ #define POPULATE_DATA(_n, _s, _l) \ do { \ Blob* _blob = ws.CreateBlob((_n)); \ - auto* _tensor = _blob->GetMutableTensor(CPU); \ + auto* _tensor = BlobGetMutableTensor(_blob, CPU); \ _tensor->Resize((_s)); \ memset(_tensor->mutable_data(), 1, _tensor->nbytes()); \ } while (0) @@ -43,7 +43,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set(tensor->size(), value, tensor->mutable_data(), @@ -56,7 +56,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/mobile/contrib/ulp2/ulp_test.cc b/caffe2/mobile/contrib/ulp2/ulp_test.cc index a1c1af0f6dfb8..6316b05284fba 100644 --- a/caffe2/mobile/contrib/ulp2/ulp_test.cc +++ b/caffe2/mobile/contrib/ulp2/ulp_test.cc @@ -289,13 +289,13 @@ void ConvTest2b1b(int IC, int KH, int KW, int H, int W, int OC, int N, ConvArgs def.add_arg()->CopyFrom(MakeArgument("pad_r", args.pad_r)); def.add_arg()->CopyFrom(MakeArgument("pad_t", args.pad_t)); def.add_arg()->CopyFrom(MakeArgument("pad_b", args.pad_b)); - auto* Xws = ws.CreateBlob("X")->GetMutableTensor(CPU); + auto* Xws = BlobGetMutableTensor(ws.CreateBlob("X"), CPU); Xws->ResizeLike(X); Xws->ShareExternalPointer(X.mutable_data(), X.size()); - auto* Wws = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* Wws = BlobGetMutableTensor(ws.CreateBlob("W"), CPU); Wws->ResizeLike(W_); Wws->ShareExternalPointer(W_.mutable_data(), W_.size()); - auto* bws = 
ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* bws = BlobGetMutableTensor(ws.CreateBlob("b"), CPU); bws->ResizeLike(bias); bws->ShareExternalPointer(bias.mutable_data(), bias.size()); ws.RunOperatorOnce(def); diff --git a/caffe2/operators/atomic_ops.cc b/caffe2/operators/atomic_ops.cc index 2ce97b0d58c5f..2c8f17649f516 100644 --- a/caffe2/operators/atomic_ops.cc +++ b/caffe2/operators/atomic_ops.cc @@ -2,6 +2,11 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif + namespace caffe2 { namespace fb { namespace { @@ -85,6 +90,10 @@ class CheckAtomicBoolOp final : public Operator { REGISTER_CPU_OPERATOR(CreateMutex, CreateMutexOp); REGISTER_CPU_OPERATOR(AtomicFetchAdd, AtomicFetchAddOp); +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR(CreateMutex, IDEEPFallbackOp>); +#endif + REGISTER_CPU_OPERATOR(CreateAtomicBool, CreateAtomicBoolOp); REGISTER_CPU_OPERATOR(ConditionalSetAtomicBool, ConditionalSetAtomicBoolOp); REGISTER_CPU_OPERATOR(CheckAtomicBool, CheckAtomicBoolOp); diff --git a/caffe2/operators/batch_matmul_op_gpu_test.cc b/caffe2/operators/batch_matmul_op_gpu_test.cc index 804296307d6ef..31e179b3e41f8 100644 --- a/caffe2/operators/batch_matmul_op_gpu_test.cc +++ b/caffe2/operators/batch_matmul_op_gpu_test.cc @@ -30,7 +30,7 @@ class BatchMatMulOpGPUTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/batch_matmul_op_test.cc b/caffe2/operators/batch_matmul_op_test.cc index 45db7dd5b8484..c74829b4f8f9c 100644 --- a/caffe2/operators/batch_matmul_op_test.cc +++ b/caffe2/operators/batch_matmul_op_test.cc @@ -24,7 +24,7 @@ class BatchMatMulOpTest : public testing::Test { const float value, const string& name) { Blob* blob = ws_.CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(dims); math::Set( tensor->size(), diff --git a/caffe2/operators/boolean_unmask_ops_test.cc b/caffe2/operators/boolean_unmask_ops_test.cc index 8814be17153d4..b0c5f7dcdfff0 100644 --- a/caffe2/operators/boolean_unmask_ops_test.cc +++ b/caffe2/operators/boolean_unmask_ops_test.cc @@ -16,7 +16,7 @@ static void AddScalarInput( Workspace* ws, bool isEmpty = false) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); if (!isEmpty) { tensor->Resize(vector{1}); *(tensor->template mutable_data()) = value; diff --git a/caffe2/operators/conv_op_shared.cc b/caffe2/operators/conv_op_shared.cc index b9f54b6d55be7..155b6f0cd2456 100644 --- a/caffe2/operators/conv_op_shared.cc +++ b/caffe2/operators/conv_op_shared.cc @@ -27,8 +27,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* buffer = - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__")->GetMutableTensor(CPU); + auto* buffer = BlobGetMutableTensor( + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CPU__"), CPU); f(buffer); } } diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc index f80d15a5d9054..c1f37c7f1362f 100644 --- a/caffe2/operators/conv_op_shared_gpu.cc +++ b/caffe2/operators/conv_op_shared_gpu.cc @@ -20,8 +20,8 @@ void runWithSharedBuffer( auto* mutexPtr = mutexBlob->GetMutable>(); std::lock_guard g(**mutexPtr); - auto* 
buffer = - ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__")->GetMutableTensor(CUDA); + auto* buffer = BlobGetMutableTensor( + ws->GetBlob("__CAFFE2_SHARED_CONV_BUFFER_CUDA__"), CUDA); f(buffer); } } diff --git a/caffe2/operators/conv_transpose_op_mobile_test.cc b/caffe2/operators/conv_transpose_op_mobile_test.cc index 6eb45eb5f8d17..3bc2951664353 100644 --- a/caffe2/operators/conv_transpose_op_mobile_test.cc +++ b/caffe2/operators/conv_transpose_op_mobile_test.cc @@ -17,7 +17,7 @@ void AddConstInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -29,7 +29,7 @@ void AddNoiseInput(const vector& shape, DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc index 8329422428083..e3c0abe83d8b4 100644 --- a/caffe2/operators/dataset_ops.cc +++ b/caffe2/operators/dataset_ops.cc @@ -1428,7 +1428,7 @@ class TreeCursorSerializer : public BlobSerializerBase { // serialize offsets as a tensor if (cursor->offsets.size() > 0) { Blob offsets_blob; - auto* offsets = offsets_blob.GetMutableTensor(CPU); + auto* offsets = BlobGetMutableTensor(&offsets_blob, CPU); offsets->Resize(cursor->offsets.size()); std::copy( cursor->offsets.begin(), diff --git a/caffe2/operators/dropout_op_cudnn.cc b/caffe2/operators/dropout_op_cudnn.cc index a68a1263f6f45..8a40c731143f4 100644 --- a/caffe2/operators/dropout_op_cudnn.cc +++ b/caffe2/operators/dropout_op_cudnn.cc @@ -150,7 +150,7 @@ bool CuDNNDropoutOp::DoRunWithType() { // Reshape tensor descriptors if necessary if (X.dims() != cudnn_input_dims_ && !is_test_) { CAFFE_ENFORCE(scratch_blob_); - Tensor* states = scratch_blob_->GetMutableTensor(CUDA); + Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA); cudnn_input_dims_ = X.dims(); CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( data_desc_, diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index bcd547e28f098..b785d040c8f1a 100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -19,7 +19,7 @@ void FillTensor( const std::vector& shape, const std::vector& values) { auto* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); + auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); tensor->Resize(shape); auto* mutable_data = tensor->template mutable_data(); const O_Type* data = reinterpret_cast(values.data()); diff --git a/caffe2/operators/expand_squeeze_dims_op.h b/caffe2/operators/expand_squeeze_dims_op.h index 505b1ec7d6909..37a3b5716127d 100644 --- a/caffe2/operators/expand_squeeze_dims_op.h +++ b/caffe2/operators/expand_squeeze_dims_op.h @@ -112,7 +112,7 @@ class SqueezeOp : public Operator { vector dims_; public: - AT_DISABLE_COPY_AND_ASSIGN(SqueezeOp); + C10_DISABLE_COPY_AND_ASSIGN(SqueezeOp); }; } // namespace caffe2 #endif // CAFFE2_OPERATORS_EXPAND_SQUEEZE_DIMS_OP_H_ diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 2b3a033a665df..da7fdc650879c 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ 
b/caffe2/operators/generate_proposals_op_test.cc @@ -18,7 +18,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); @@ -34,7 +34,7 @@ static void AddLinSpacedInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -51,7 +51,7 @@ static void AddInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); diff --git a/caffe2/operators/index_ops.cc b/caffe2/operators/index_ops.cc index 241b0ff97c607..2fb8f3b338dc6 100644 --- a/caffe2/operators/index_ops.cc +++ b/caffe2/operators/index_ops.cc @@ -353,7 +353,7 @@ class IndexSerializer : public BlobSerializerBase { SerializationAcceptor acceptor) override { auto& base = blob.template Get>(); Blob tensor_blob; - auto* tensor_out = tensor_blob.GetMutableTensor(CPU); + auto* tensor_out = BlobGetMutableTensor(&tensor_blob, CPU); if (base->Type().Match()) { doStore(base, tensor_out); diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index dbd5103952469..7a3c34cfbf7cc 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -213,23 +213,23 @@ class ONNXWhileOp final : public Operator { lcd_tensors_.clear(); for (int i = 2; i < body_net_def.external_input_size(); ++i) { Blob* b = loop_ws_->CreateBlob(body_net_def.external_input(i)); - Tensor* t = b->GetMutableTensor(Context::GetDeviceType()); + Tensor* t = BlobGetMutableTensor(b, Context::GetDeviceType()); lcd_tensors_.push_back(t); } // First output is the iteration variable auto* iteration_var_blob = loop_ws_->CreateBlob( body_net_def.external_input(0)); iteration_var_ = - iteration_var_blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(iteration_var_blob, Context::GetDeviceType()); - input_condition_var_ = - loop_ws_->CreateBlob(body_net_def.external_input(1)) - ->GetMutableTensor(Context::GetDeviceType()); + input_condition_var_ = BlobGetMutableTensor( + loop_ws_->CreateBlob(body_net_def.external_input(1)), + Context::GetDeviceType()); auto* condition_var_blob = loop_ws_->CreateBlob(body_net_def.external_output(0)); condition_var_ = - condition_var_blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(condition_var_blob, Context::GetDeviceType()); condition_var_->Resize(1); condition_var_->template mutable_data(); diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index d1b0824f1b319..767a37d5fc792 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->IsTensorType(CPU), + BlobIsTensorType(*blob, CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index 8ef39e7c0e78d..5b3a38dbfbd13 100644 --- 
a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -65,8 +65,8 @@ class GPUFallbackOpEx final : public Operator { bool need_sync = false; for (int i = 0; i < InputSize(); ++i) { if (this->InputIsTensorType(i, CUDA)) { - local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( - Input(i), &context_); + BlobGetMutableTensor(local_input_blobs_[i], CPU) + ->CopyFrom(Input(i), &context_); need_sync = true; } else { VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy."; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->IsTensorType(CPU), + BlobIsTensorType(*local_output_blobs_[i], CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/operator_fallback_gpu_test.cc b/caffe2/operators/operator_fallback_gpu_test.cc index 964708bc10906..0870a4be2dd7b 100644 --- a/caffe2/operators/operator_fallback_gpu_test.cc +++ b/caffe2/operators/operator_fallback_gpu_test.cc @@ -40,7 +40,7 @@ TEST(OperatorFallbackTest, IncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - ws.CreateBlob("X")->GetMutableTensor(CPU)->CopyFrom(source_tensor); + BlobGetMutableTensor(ws.CreateBlob("X"), CPU)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); @@ -64,7 +64,7 @@ TEST(OperatorFallbackTest, GPUIncrementByOneOp) { for (int i = 0; i < 6; ++i) { source_tensor.mutable_data()[i] = i; } - ws.CreateBlob("X")->GetMutableTensor(CUDA)->CopyFrom(source_tensor); + BlobGetMutableTensor(ws.CreateBlob("X"), CUDA)->CopyFrom(source_tensor); unique_ptr op(CreateOperator(op_def, &ws)); EXPECT_TRUE(op.get() != nullptr); EXPECT_TRUE(op->Run()); diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h index 94bd1e6150cef..32f31f97d878c 100644 --- a/caffe2/operators/partition_ops.h +++ b/caffe2/operators/partition_ops.h @@ -221,7 +221,7 @@ class PartitionOp : public PartitionOpBase { return true; } - AT_DISABLE_COPY_AND_ASSIGN(PartitionOp); + C10_DISABLE_COPY_AND_ASSIGN(PartitionOp); }; class LengthsPartitionOp : public PartitionOpBase { @@ -302,7 +302,7 @@ class LengthsPartitionOp : public PartitionOpBase { return true; } - AT_DISABLE_COPY_AND_ASSIGN(LengthsPartitionOp); + C10_DISABLE_COPY_AND_ASSIGN(LengthsPartitionOp); vector out_length_; }; diff --git a/caffe2/operators/reshape_op_gpu_test.cc b/caffe2/operators/reshape_op_gpu_test.cc index 3537ab69d058f..d4ac325a78b80 100644 --- a/caffe2/operators/reshape_op_gpu_test.cc +++ b/caffe2/operators/reshape_op_gpu_test.cc @@ -20,7 +20,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h index 98675cea858d5..63d58f3ccd8f6 100644 --- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h +++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h @@ -43,11 +43,10 @@ class RecurrentNetworkBlobFetcherOp final : public Operator { prefix_ + std::string("_") + blob_name + caffe2::to_string(i); 
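Editorial note: a second mechanical rename running through the operator headers in this patch (expand_squeeze_dims_op.h, partition_ops.h, slice_op.h/.cu) is AT_DISABLE_COPY_AND_ASSIGN → C10_DISABLE_COPY_AND_ASSIGN, matching the move of these macros under the c10 prefix. Usage is unchanged; the macro conventionally deletes the copy constructor and copy assignment. A hypothetical operator showing the placement, assuming the macro is visible through the usual core headers:

#include "caffe2/core/operator.h"

namespace caffe2 {

// Hypothetical operator, only to show where the renamed macro sits.
class ExampleNoCopyOp final : public Operator<CPUContext> {
 public:
  using Operator<CPUContext>::Operator;
  bool RunOnDevice() override {
    return true;
  }

  C10_DISABLE_COPY_AND_ASSIGN(ExampleNoCopyOp);
};

} // namespace caffe2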
blob_names_vector.push_back(newBlobName); - ws_->CreateBlob(newBlobName) - ->GetMutableTensor(CPU) + BlobGetMutableTensor(ws_->CreateBlob(newBlobName), CPU) ->ResizeLike(currentTensor); auto type = Context::GetDeviceType(); - auto* newTensor = ws_->GetBlob(newBlobName)->GetMutableTensor(type); + auto* newTensor = BlobGetMutableTensor(ws_->GetBlob(newBlobName), type); newTensor->CopyFrom(currentTensor); } } diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h index 7e37e562e77a5..4cb53a6d7d330 100644 --- a/caffe2/operators/rnn/recurrent_network_executor.h +++ b/caffe2/operators/rnn/recurrent_network_executor.h @@ -111,10 +111,10 @@ class RecurrentNetworkExecutorBase { // the forward-only mode. std::string this_timestep_blob = timestep_blob_ + "_rnnexec_t" + caffe2::to_string(t); - ws->CreateBlob(this_timestep_blob)->GetMutableTensor(CPU)->Resize(1); + BlobGetMutableTensor(ws->CreateBlob(this_timestep_blob), CPU)->Resize(1); auto b = ws->GetBlob(this_timestep_blob); CAFFE_ENFORCE(b); - b->GetMutableTensor(CPU)->template mutable_data()[0] = t; + BlobGetMutableTensor(b, CPU)->template mutable_data()[0] = t; // Copy the operators from template for (auto& template_rnn_op : timestep_ops_template_) { diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h index 2421bc44263af..21b3064a6fac3 100644 --- a/caffe2/operators/rnn/recurrent_network_op.h +++ b/caffe2/operators/rnn/recurrent_network_op.h @@ -52,10 +52,11 @@ struct CAFFE2_API ScratchWorkspaces { }; inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) { - ws->CreateBlob(blob_name)->GetMutableTensor(CPU)->Resize(1); + BlobGetMutableTensor(ws->CreateBlob(blob_name), CPU)->Resize(1); auto timestepBlob = ws->GetBlob(blob_name); CAFFE_ENFORCE(timestepBlob); - timestepBlob->GetMutableTensor(CPU)->template mutable_data()[0] = t; + BlobGetMutableTensor(timestepBlob, CPU)->template mutable_data()[0] = + t; } CAFFE2_API std::map GetRecurrentMapping( @@ -71,8 +72,9 @@ void applyOffsetAlias( << " at offset: " << oc.offset; auto srcBlob = ws->GetBlob(oc.src); CAFFE_ENFORCE(srcBlob); - auto* src = srcBlob->GetMutableTensor(Context::GetDeviceType()); - auto* dst = ws->GetBlob(oc.dst)->GetMutableTensor(Context::GetDeviceType()); + auto* src = BlobGetMutableTensor(srcBlob, Context::GetDeviceType()); + auto* dst = + BlobGetMutableTensor(ws->GetBlob(oc.dst), Context::GetDeviceType()); auto timestep = src->size() / src->dim(0); auto dims = src->dims(); const int32_t startDstTimestep = @@ -113,7 +115,7 @@ void initializeRecurrentInput( Context* context) { auto stateBlob = ws->GetBlob(rc.state); CAFFE_ENFORCE(stateBlob); - auto* state = stateBlob->GetMutableTensor(Context::GetDeviceType()); + auto* state = BlobGetMutableTensor(stateBlob, Context::GetDeviceType()); auto inputBlob = ws->GetBlob(rc.input); CAFFE_ENFORCE(inputBlob); @@ -660,7 +662,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->GetBlob(param.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); g->ResizeLike(p); math::Set( g->size(), @@ -676,7 +678,7 @@ class RecurrentNetworkGradientOp final : public Operator { auto gBlob = sharedWs_->CreateBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); g->ResizeLike(p); 
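Editorial note: the recurrent_network_op.h hunk above rewrites UpdateTimestepBlob with the free-function accessor. The template argument on mutable_data was lost in this extract; the reconstruction below assumes it is int32_t and should be checked against the source.

#include "caffe2/core/tensor.h"
#include "caffe2/core/workspace.h"

namespace caffe2 {

// Reconstructed sketch of UpdateTimestepBlob after this patch.
inline void UpdateTimestepBlob(Workspace* ws, std::string blob_name, int t) {
  BlobGetMutableTensor(ws->CreateBlob(blob_name), CPU)->Resize(1);
  auto timestepBlob = ws->GetBlob(blob_name);
  CAFFE_ENFORCE(timestepBlob);
  BlobGetMutableTensor(timestepBlob, CPU)->template mutable_data<int32_t>()[0] =
      t;
}

} // namespace caffe2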
CAFFE_ENFORCE_EQ(g->ndim(), 3); const auto timestep = g->size() / g->dim(0); @@ -703,7 +705,7 @@ class RecurrentNetworkGradientOp final : public Operator { << ". Size: " << Input(gradientInputIndex).size(); auto pGradientBlob = sharedWs_->GetBlob(gradientName); CAFFE_ENFORCE(pGradientBlob); - auto* g = pGradientBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(pGradientBlob, Context::GetDeviceType()); g->ResizeLike(Input(gradientInputIndex)); g->template mutable_data(); } @@ -717,7 +719,7 @@ class RecurrentNetworkGradientOp final : public Operator { << rg.lastExternalGrad << " for final time step (sep. blob)"; auto gBlob = sharedWs_->GetBlob(rg.grad); CAFFE_ENFORCE(gBlob); - auto* g = gBlob->GetMutableTensor(Context::GetDeviceType()); + auto* g = BlobGetMutableTensor(gBlob, Context::GetDeviceType()); auto oglastBlob = sharedWs_->GetBlob(rg.lastExternalGrad); CAFFE_ENFORCE(oglastBlob); @@ -779,7 +781,7 @@ class RecurrentNetworkGradientOp final : public Operator { T* output_data = Output(outputIdx)->template mutable_data(); auto pBlob = sharedWs_->GetBlob(recurrentGradients_[i].grad); CAFFE_ENFORCE(pBlob); - auto* p = pBlob->GetMutableTensor(Context::GetDeviceType()); + auto* p = BlobGetMutableTensor(pBlob, Context::GetDeviceType()); if (Input(inputId).ndim() >= 2) { // Gradient states blob should live. And if it gets changed by the diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index 2647a97d6f0b9..7257ec44c2598 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -18,7 +18,7 @@ void AddConstInput( Context* context, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(Context::GetDeviceType()); + auto* tensor = BlobGetMutableTensor(blob, Context::GetDeviceType()); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), context); @@ -39,7 +39,7 @@ void AddInput( const string& name, Workspace* ws) { Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); EigenVectorMap tensor_vec( tensor->template mutable_data(), tensor->size()); @@ -57,7 +57,7 @@ void AddInput( tmp_vec.array() = utils::AsEArrXt(values); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->CopyFrom(tmp); } diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index 475d8329c9249..8ddb204ebd5b4 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -302,7 +302,7 @@ class SliceGradientOp : public Operator { ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + C10_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); bool RunOnDevice() override { if (InputSize() == 4) { diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index e7f8919bb81c8..6149f077669d7 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -249,7 +249,7 @@ class SliceOp : public Operator { output, data, starts_host_, ends_host_, &context_); } - AT_DISABLE_COPY_AND_ASSIGN(SliceOp); + C10_DISABLE_COPY_AND_ASSIGN(SliceOp); protected: std::vector starts_; @@ -269,7 +269,7 @@ class SliceGradientOp : public Operator { ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} - 
AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + C10_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); bool RunOnDevice() override { if (InputSize() == 4) { diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index c9ba13efb5025..2092ae804f2c3 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -9,7 +9,7 @@ class StringJoinOpTest : public testing::Test { public: bool runOp(const TensorCPU& input) { auto* blob = ws_.CreateBlob("X"); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->ResizeLike(input); tensor->ShareData(input); @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsTensorType(CPU)); + EXPECT_TRUE(BlobIsTensorType(*output, CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); @@ -42,7 +42,7 @@ TEST_F(StringJoinOpTest, testString1DJoin) { std::vector input = {"a", "xx", "c"}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -62,7 +62,7 @@ TEST_F(StringJoinOpTest, testString2DJoin) { {"dd", "ee", "ff"}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -82,7 +82,7 @@ TEST_F(StringJoinOpTest, testFloat1DJoin) { std::vector input = {3.90f, 5.234f, 8.12f}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -102,7 +102,7 @@ TEST_F(StringJoinOpTest, testFloat2DJoin) { {4.67f, 5.90f, 6.32f}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { @@ -122,7 +122,7 @@ TEST_F(StringJoinOpTest, testLong2DJoin) { std::vector> input = {{100, 200}, {1000, 2000}}; auto blob = caffe2::make_unique(); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob.get(), CPU); tensor->Resize(input.size(), input[0].size()); auto* data = tensor->template mutable_data(); for (int i = 0; i < input.size(); ++i) { diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index a6d395fe9ba64..bfc41a462999b 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,10 +82,10 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsTensorType(CPU)) { + if (!BlobIsTensorType(*noiseBlob, CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. 
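The AT_DISABLE_COPY_AND_ASSIGN occurrences in these operator classes all become C10_DISABLE_COPY_AND_ASSIGN, used inside the class body exactly as before. A small sketch with a hypothetical class (assuming the macro is available through the c10 macros header referenced elsewhere in this patch):

#include "c10/macros/Macros.h"

class ExampleHolder {
 public:
  ExampleHolder() = default;

 private:
  // Same effect as the old AT_ spelling: copy construction and copy
  // assignment are disabled for this type.
  C10_DISABLE_COPY_AND_ASSIGN(ExampleHolder);
};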
- auto* t = noiseBlob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(noiseBlob, CPU); #if defined(__ARM_NEON__) || defined(__ARM_NEON) // Noise space is larger for vectorized code due to the diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h index cd081bf959e39..e9f5b1a8f8455 100644 --- a/caffe2/operators/tensor_protos_db_input.h +++ b/caffe2/operators/tensor_protos_db_input.h @@ -56,7 +56,7 @@ bool TensorProtosDBInput::Prefetch() { protos.mutable_protos(i)->clear_device_detail(); } deserializer.Deserialize( - protos.protos(i), prefetched_blobs_[i].GetMutableTensor(CPU)); + protos.protos(i), BlobGetMutableTensor(&prefetched_blobs_[i], CPU)); } } else { vector temp_tensors; @@ -74,11 +74,11 @@ bool TensorProtosDBInput::Prefetch() { vector dims( protos.protos(i).dims().begin(), protos.protos(i).dims().end()); dims.insert(dims.begin(), batch_size_); - prefetched_blobs_[i].GetMutableTensor(CPU)->Resize(dims); + BlobGetMutableTensor(&prefetched_blobs_[i], CPU)->Resize(dims); } } for (int i = 0; i < protos.protos_size(); ++i) { - TensorCPU* dst = prefetched_blobs_[i].GetMutableTensor(CPU); + TensorCPU* dst = BlobGetMutableTensor(&prefetched_blobs_[i], CPU); TensorCPU& src = temp_tensors[i]; if (protos.protos(i).has_device_detail()) { protos.mutable_protos(i)->clear_device_detail(); diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h index 421c26e318b6e..1a5cdc344ce4a 100644 --- a/caffe2/operators/tt_linear_op.h +++ b/caffe2/operators/tt_linear_op.h @@ -52,7 +52,7 @@ class TTLinearOp final : public Operator { int cores_idx = 0; // Temporary buffer to facilitate multiplication of TT-cores with input - auto Y_buf = Y_temp_->GetMutableTensor(Context::GetDeviceType()); + auto Y_buf = BlobGetMutableTensor(Y_temp_.get(), Context::GetDeviceType()); Y_buf->ResizeLike(X); Y_buf->CopyFrom(X); diff --git a/caffe2/operators/utility_ops_gpu_test.cc b/caffe2/operators/utility_ops_gpu_test.cc index f500afaf9ed24..1099d900cbefd 100644 --- a/caffe2/operators/utility_ops_gpu_test.cc +++ b/caffe2/operators/utility_ops_gpu_test.cc @@ -19,7 +19,7 @@ static void AddConstInput( option.set_device_type(PROTO_CUDA); CUDAContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CUDA); + auto* tensor = BlobGetMutableTensor(blob, CUDA); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/operators/utility_ops_test.cc b/caffe2/operators/utility_ops_test.cc index 379dd52655c4f..a3a2a409674ed 100644 --- a/caffe2/operators/utility_ops_test.cc +++ b/caffe2/operators/utility_ops_test.cc @@ -16,7 +16,7 @@ static void AddConstInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::Set( tensor->size(), value, tensor->template mutable_data(), &context); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index fdf5fdc31e104..8c324a97c5093 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -44,10 +44,10 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { CAFFE_ENFORCE( bnInputs.size() >= 5, "Invalid batch normalization input size"); -#define EXPOSE_TENSOR_DATA(name, index, inputs) \ - auto name = repr::nn::get(inputs[index]); \ - assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ - auto name##Tensor = 
ws->GetBlob(name->getName())->GetMutableTensor(CPU); \ +#define EXPOSE_TENSOR_DATA(name, index, inputs) \ + auto name = repr::nn::get(inputs[index]); \ + assert(ws->HasBlob(name->getName()) && "Blob not in workspace"); \ + auto name##Tensor = BlobGetMutableTensor(ws->GetBlob(name->getName()), CPU); \ auto name##Data = name##Tensor->mutable_data(); EXPOSE_TENSOR_DATA(filter, 1, convInputs); @@ -76,7 +76,7 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { nn->dataFlow.createEdge(convBiasNode, convNode); auto* blob = ws->CreateBlob(convBiasName); - caffe2::TensorCPU* tensor = blob->GetMutableTensor(caffe2::CPU); + caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU); CHECK_NOTNULL(tensor); // Get output channel size_t c = filterTensor->dim32(0); diff --git a/caffe2/opt/fusion.h b/caffe2/opt/fusion.h index 33dc2e4c54b1a..0973ade54b383 100644 --- a/caffe2/opt/fusion.h +++ b/caffe2/opt/fusion.h @@ -37,7 +37,7 @@ CAFFE2_API void fuseConvBN(repr::NNModule* nn, caffe2::Workspace* ws); // \param postprocess Functor to postprocess the conv node, // attaching additional attributes if necessary template -CAFFE2_EXPORT void fuseActivation( +C10_EXPORT void fuseActivation( repr::NNModule* nn, std::function should_fuse, std::function postprocess) { diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index ce79df56ecb72..a048503fea99c 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -173,7 +173,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOp( // Feed into workspace as CPU Tensors auto* blob = ws->CreateBlob(t.name()); - auto* cpu_tensor = blob->GetMutableTensor(CPU); + auto* cpu_tensor = BlobGetMutableTensor(blob, CPU); std::vector dims; for(const auto& d : t.dims()) { dims.push_back(d); diff --git a/caffe2/opt/sink.cc b/caffe2/opt/sink.cc index c4d73d7abb12d..ed4cd8a372537 100644 --- a/caffe2/opt/sink.cc +++ b/caffe2/opt/sink.cc @@ -8,7 +8,7 @@ namespace opt { using namespace nom; -CAFFE2_EXPORT void sinkMaxPool(nom::repr::NNModule* nn) { +C10_EXPORT void sinkMaxPool(nom::repr::NNModule* nn) { for (auto max_pool_node : repr::nn::nodeIterator(nn->dataFlow)) { if (repr::nn::getInputs(max_pool_node).size() != 1) { diff --git a/caffe2/perfkernels/CMakeLists.txt b/caffe2/perfkernels/CMakeLists.txt index 3781bbb6afb6b..a5701da807f4f 100644 --- a/caffe2/perfkernels/CMakeLists.txt +++ b/caffe2/perfkernels/CMakeLists.txt @@ -17,8 +17,8 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${common_srcs}) if (NOT MSVC AND CAFFE2_COMPILER_SUPPORTS_AVX2_EXTENSIONS) add_library(Caffe2_perfkernels_avx OBJECT ${avx_srcs}) add_library(Caffe2_perfkernels_avx2 OBJECT ${avx2_srcs}) - add_dependencies(Caffe2_perfkernels_avx Caffe2_PROTO) - add_dependencies(Caffe2_perfkernels_avx2 Caffe2_PROTO) + add_dependencies(Caffe2_perfkernels_avx Caffe2_PROTO c10) + add_dependencies(Caffe2_perfkernels_avx2 Caffe2_PROTO c10) if (MSVC) set_target_properties( Caffe2_perfkernels_avx PROPERTIES COMPILE_FLAGS "/arch:AVX") diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 84dac93753d37..7775e69776450 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,14 +10,14 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); + BlobIsTensorType(*blob, CPU), "Blob is not a CPU Tensor: ", name); } TensorCPU* 
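Alongside the Blob changes, declarations that were annotated CAFFE2_EXPORT are re-annotated C10_EXPORT, with the macro in the same position. A hedged sketch with hypothetical names (assuming C10_EXPORT comes from the c10 macros header, as in the headers touched above):

#include "c10/macros/Macros.h"

// Exported free function: the annotation precedes the return type,
// as in fuseActivation and sinkMaxPool above.
C10_EXPORT void exampleTransform(int n);

// Exported class: the annotation sits between the class keyword and the name,
// as in the pybind fetcher base class above.
class C10_EXPORT ExampleFetcherBase {
 public:
  virtual ~ExampleFetcherBase() = default;
};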
getTensor(Workspace* ws, const std::string& name) { enforceIsTensor(ws, name); auto* blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob: ", name, " does not exist"); - return blob->GetMutableTensor(CPU); + return BlobGetMutableTensor(blob, CPU); } void shareInputTensor( @@ -60,7 +60,7 @@ Predictor::Predictor(PredictorConfig config) : config_(std::move(config)) { for (const auto& name : config_.predict_net->external_input()) { if (!initialized.count(name)) { auto* blob = config_.ws->CreateBlob(name); - blob->GetMutableTensor(CPU); + BlobGetMutableTensor(blob, CPU); } } CAFFE_ENFORCE(config_.ws->CreateNet(config_.predict_net)); diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc index ae4f73e9da0ad..a0245cd7a86d6 100644 --- a/caffe2/predictor/predictor_test.cc +++ b/caffe2/predictor/predictor_test.cc @@ -135,7 +135,7 @@ std::unique_ptr randomTensor( const std::vector& dims, CPUContext* ctx) { auto blob = make_unique(); - auto* t = blob->GetMutableTensor(CPU); + auto* t = BlobGetMutableTensor(blob.get(), CPU); t->Resize(dims); math::RandUniform( t->size(), -1.0, 1.0, t->template mutable_data(), ctx); @@ -180,7 +180,7 @@ TEST_F(PredictorTest, SimpleBatchSized) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorList input; input.emplace_back(CPU); - auto tensor = inputData->GetMutableTensor(CPU); + auto tensor = BlobGetMutableTensor(inputData.get(), CPU); input.back().ResizeLike(*tensor); input.back().ShareData(*tensor); Predictor::TensorList output; @@ -196,7 +196,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) { auto inputData = randomTensor({1, 4}, ctx_.get()); Predictor::TensorMap input; auto iter = input.emplace("data", Tensor(CPU)); - auto tensor = inputData->GetMutableTensor(CPU); + auto tensor = BlobGetMutableTensor(inputData.get(), CPU); iter.first->second.ResizeLike(*tensor); iter.first->second.ShareData(*tensor); diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index ebf3c3b8cd44a..1b9b4929bb0f9 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -831,6 +831,7 @@ def param_update_fun(model): return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix)) + @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") def test_equiv_recurrent(self): ''' Test that the model produces exactly same results given diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 8ffaef3004d9a..ad229a97f807d 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -59,6 +59,10 @@ if 'JENKINS_URL' in os.environ: backend_test.exclude(r'(test_vgg19|test_vgg)') +# FIXME: flaky test in CircleCI +if "IN_CIRCLECI" in os.environ: + backend_test.exclude(r'(test_dynamic_slice_cpu)') + # import all test cases at global scope to make them visible to python.unittest globals().update(backend_test .enable_report() diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index 5ee60d877c33b..f97b0c5809d5f 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -8,6 +8,8 @@ import hypothesis.strategies as st import numpy as np +import unittest +import os def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) @@ -248,6 +250,7 @@ def weighted_sigmoid_xentr_logit_grad_ref(g_out, outputs, 
fwd_inputs): output_to_grad='xentropy', grad_reference=weighted_sigmoid_xentr_logit_grad_ref) + @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(n=st.integers(2, 10), b=st.integers(1, 5), **hu.gcs_cpu_only) diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 6db6cae47ad1c..46b16f4356ff5 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -10,6 +10,9 @@ import hypothesis.strategies as st import numpy as np +import unittest +import os + class TestReduceFrontSum(hu.HypothesisTestCase): @given(batch_size=st.integers(1, 3), @@ -111,6 +114,7 @@ def test_im2col_layout(self, batch_size, stride, pad, kernel, dilation, atol=1e-4, rtol=1e-4) + @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") @given(batch_size=st.integers(1, 3), stride=st.integers(1, 3), pad=st.integers(0, 3), diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index 81197047102ff..9a1d715bfdf22 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -328,7 +328,7 @@ void addObjectMethods(py::module& m) { }) .def( "tensor", - [](Blob* blob) { return py::cast(blob->GetMutableTensor(CPU)); }, + [](Blob* blob) { return py::cast(BlobGetMutableTensor(blob, CPU)); }, py::return_value_policy::reference_internal) .def( "_feed", diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 59f39dd313032..4f81569e42936 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -43,7 +43,7 @@ void addObjectMethods(pybind11::module& m); // Get current workspace Workspace* GetCurrentWorkspace(); -class CAFFE2_EXPORT BlobFetcherBase { +class C10_EXPORT BlobFetcherBase { public: struct FetchedBlob { pybind11::object obj; @@ -60,7 +60,7 @@ class BlobFeederBase { Feed(const DeviceOption& option, PyArrayObject* array, Blob* blob) = 0; }; -CAFFE2_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( +C10_EXPORT CAFFE_DECLARE_TYPED_REGISTRY( BlobFetcherRegistry, TypeIdentifier, BlobFetcherBase, @@ -234,7 +234,7 @@ class TensorFeeder : public BlobFeederBase { FeedTensor( option, original_array, - blob->GetMutableTensor(Context::GetDeviceType())); + BlobGetMutableTensor(blob, Context::GetDeviceType())); } }; @@ -366,31 +366,32 @@ class PythonOpBase : public Operator { // make sure output blob is initialized before creating the binding if (forced_cpu_outputs_.count(i)) { - blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(blob, Context::GetDeviceType()); } else { - blob->GetMutableTensor(Context::GetDeviceType()); + BlobGetMutableTensor(blob, Context::GetDeviceType()); } py::object py_obj; if (blob->template IsType()) { if (use_dlpack) { DLPackWrapper wrapper( - blob->GetMutableTensor(Context::GetDeviceType()), cpu_option); + BlobGetMutableTensor(blob, Context::GetDeviceType()), + cpu_option); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), py::return_value_policy::reference); } } else { if (use_dlpack) { DLPackWrapper wrapper( - blob->GetMutableTensor(Context::GetDeviceType()), + BlobGetMutableTensor(blob, Context::GetDeviceType()), this->device_option()); py_obj = py::cast(wrapper, py::return_value_policy::copy); } else { py_obj = py::cast( - blob->GetMutableTensor(Context::GetDeviceType()), + 
BlobGetMutableTensor(blob, Context::GetDeviceType()), py::return_value_policy::reference); } } diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index ebad6cf8d9683..f0307f7b6485d 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -163,8 +163,8 @@ class IDeepFeeder : public BlobFeederBase { DeviceOption cpu_option(option); cpu_option.set_device_type(DeviceTypeProto::PROTO_CPU); TensorFeeder cpu_tensor_feeder; - cpu_tensor_feeder.FeedTensor(cpu_option, original_array, - blob->GetMutableTensor(CPU)); + cpu_tensor_feeder.FeedTensor( + cpu_option, original_array, BlobGetMutableTensor(blob, CPU)); } } catch (ideep::error &e) { LOG(ERROR) << "IDEEP error: " << e.message; diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip deleted file mode 100644 index e4019f68dfd0e..0000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz deleted file mode 100644 index 0dfa5f9790c01..0000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/inout.npz and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb deleted file mode 100644 index b1f14dad9aefd..0000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_prefix/op.pb and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip deleted file mode 100644 index cc60f7242ee69..0000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz deleted file mode 100644 index 0dfa5f9790c01..0000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/inout.npz and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb deleted file mode 100644 index d59c513004803..0000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_suffix/op.pb and /dev/null differ diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index feb5d8e127cb8..67081fa77d025 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -11,9 +11,9 @@ import inspect import numpy as np import os -import re import shutil import sys +import tempfile import threading from zipfile import ZipFile @@ -140,16 +140,15 @@ def parse_proto(x): source_dir = 
self.get_output_dir() test_name = self.get_output_filename() - full_dir = os.path.join(source_dir, test_name) - _prepare_dir(full_dir) + temp_dir = tempfile.mkdtemp() with ZipFile(os.path.join(source_dir, test_name + '.zip')) as z: - loaded = z.extractall(full_dir) + z.extractall(temp_dir) - op_path = os.path.join(full_dir, 'op.pb') - inout_path = os.path.join(full_dir, 'inout.npz') - loaded = np.load(inout_path, encoding='bytes') + op_path = os.path.join(temp_dir, 'op.pb') + inout_path = os.path.join(temp_dir, 'inout.npz') # load serialized input and output + loaded = np.load(inout_path, encoding='bytes') loaded_inputs = loaded['inputs'].tolist() inputs_equal = True for (x, y) in zip(inputs, loaded_inputs): @@ -157,16 +156,16 @@ def parse_proto(x): inputs_equal = False loaded_outputs = loaded['outputs'].tolist() - # load operator - with open(op_path, 'rb') as f: - loaded_op = f.read() - - op_proto = parse_proto(loaded_op) - device_type = loaded['device_type'] - device_option = caffe2_pb2.DeviceOption(device_type=int(device_type)) - # if inputs are not the same, run serialized input through serialized op if not inputs_equal: + # load operator + with open(op_path, 'rb') as f: + loaded_op = f.read() + + op_proto = parse_proto(loaded_op) + device_type = loaded['device_type'] + device_option = caffe2_pb2.DeviceOption(device_type=int(device_type)) + outputs = hu.runOpOnInput(device_option, op_proto, loaded_inputs) grad_ops = _getGradientOrNone(op_proto) @@ -176,12 +175,13 @@ def parse_proto(x): # assert gradient op is equal for i in range(len(grad_ops)): - with open(os.path.join(full_dir, 'grad_{}.pb'.format(i)), 'rb') as f: + grad_path = os.path.join(temp_dir, 'grad_{}.pb'.format(i)) + with open(grad_path, 'rb') as f: loaded_grad = f.read() grad_proto = parse_proto(loaded_grad) self.assertTrue(grad_proto == grad_ops[i]) - shutil.rmtree(full_dir) + shutil.rmtree(temp_dir) def assertSerializedOperatorChecks( self, diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index 769679e46f2b7..dc1f737013223 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -16,6 +16,28 @@ def rand_array(*dims): return np.array(np.random.rand(*dims) - 0.5).astype(np.float32) +def randBlob(name, type, *dims, **kwargs): + offset = kwargs['offset'] if 'offset' in kwargs else 0.0 + workspace.FeedBlob(name, np.random.rand(*dims).astype(type) + offset) + + +def randBlobFloat32(name, *dims, **kwargs): + randBlob(name, np.float32, *dims, **kwargs) + + +def randBlobsFloat32(names, *dims, **kwargs): + for name in names: + randBlobFloat32(name, *dims, **kwargs) + + +def str_compare(a, b, encoding="utf8"): + if isinstance(a, bytes): + a = a.decode(encoding) + if isinstance(b, bytes): + b = b.decode(encoding) + return a == b + + class TestCase(unittest.TestCase): @classmethod def setUpClass(cls): diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 383b8410ea6ae..26f5450605a1c 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -23,39 +23,32 @@ import numpy as np from caffe2.python.transformations import Transformer -from caffe2.python import core, workspace, test_util +from caffe2.python import core, workspace +from caffe2.python import test_util as tu transformer = Transformer() -def str_compare(a, b, encoding="utf8"): - if isinstance(a, bytes): - a = a.decode(encoding) - if isinstance(b, bytes): - b = b.decode(encoding) - return a == b - - -class TestTransformations(test_util.TestCase): +class 
TestTransformations(tu.TestCase): def test_transformer_AddNNPACK(self): net = core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) transformer.AddNNPACK(net) - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") def test_transformer_FuseNNPACKConvRelu(self): net = core.Net("net") net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 1 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg @@ -65,12 +58,12 @@ def test_noFuseNNPACKConvRelu(self): net.Relu(["Y"], ["Y2"]) net.Relu(["Y"], ["Y3"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 3 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation") and str_compare(arg.s, "Relu"): + if tu.str_compare(arg.name, "activation") and tu.str_compare(arg.s, "Relu"): has_activation_arg = True assert not has_activation_arg @@ -79,13 +72,13 @@ def test_transformer_FuseNNPACKConvReluNoInplace(self): net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["X"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 1 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -95,13 +88,13 @@ def test_transformer_FuseNNPACKConvReluInplaceRelu(self): net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 1 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -112,13 +105,13 @@ def test_transformer_FuseNNPACKConvReluPingPongNaming(self): net.Relu(["Y"], ["X"]) net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 2 
has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -131,13 +124,13 @@ def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self): net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y"], ["Y2"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 2 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -150,13 +143,13 @@ def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self): net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW") net.Relu(["Y2"], ["Y2"]) transformer.AddNNPACK(net) # get the NNPACK engine - assert str_compare(net.Proto().op[0].engine, "NNPACK") + assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") transformer.FuseNNPACKConvRelu(net) assert len(net.Proto().op) == 2 has_activation_arg = False for arg in net.Proto().op[0].arg: - if str_compare(arg.name, "activation"): - assert str_compare(arg.s, "Relu") + if tu.str_compare(arg.name, "activation"): + assert tu.str_compare(arg.s, "Relu") has_activation_arg = True assert has_activation_arg assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] @@ -168,8 +161,8 @@ def test_transformer_SinkMaxPool(self): net.MaxPool(["Y"], ["Y1"], kernel=3) net.Relu(["Y1"], ["Y1"]) transformer.SinkMaxPool(net) - assert str_compare(net.Proto().op[1].type, "Relu") - assert str_compare(net.Proto().op[2].type, "MaxPool") + assert tu.str_compare(net.Proto().op[1].type, "Relu") + assert tu.str_compare(net.Proto().op[2].type, "MaxPool") @given( size=st.integers(7, 10), @@ -196,18 +189,16 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon np.random.seed(seed) if order == "NCHW": - workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, h, w) + tu.randBlobFloat32("w", c, c, k, k) else: - workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) - workspace.FeedBlob("b", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, h, w, c) + tu.randBlobFloat32("w", c, k, k, c) + tu.randBlobsFloat32(["b", "scale", "bias", "mean"], c) + # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() 
workspace.FeedBlob("Y2", np.zeros((1, 1))) @@ -250,17 +241,15 @@ def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, orde np.random.seed(seed) if order == "NCHW": - workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, h, w) + tu.randBlobFloat32("w", c, c, k, k) else: - workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, h, w, c) + tu.randBlobFloat32("w", c, k, k, c) + tu.randBlobsFloat32(["scale", "bias", "mean"], c) # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) @@ -303,17 +292,15 @@ def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channe np.random.seed(seed) if order == "NCHW": - workspace.FeedBlob("X", np.random.rand(1, c, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, k, k).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, h, w) + tu.randBlobFloat32("w", c, c, k, k) else: - workspace.FeedBlob("X", np.random.rand(1, h, w, c).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, k, k, c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("_bias0", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, h, w, c) + tu.randBlobFloat32("w", c, k, k, c) + tu.randBlobsFloat32(["scale", "_bias0", "mean"], c) # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) @@ -366,15 +353,12 @@ def test_transformer_FuseConv3DBN( ) np.random.seed(seed) - workspace.FeedBlob("X", np.random.rand(1, c, t, h, w).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(c, c, kt, kh, kw).astype(np.float32)) - workspace.FeedBlob("b", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) + tu.randBlobFloat32("X", 1, c, t, h, w) + tu.randBlobFloat32("w", c, c, kt, kh, kw) + tu.randBlobsFloat32(["b", "scale", "bias", "mean"], c) # This is necessary because 1/sqrt(var) is used and if var is too small # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + tu.randBlobFloat32("var", c, offset=0.5) workspace.RunNetOnce(net) preTransformOutput = workspace.FetchBlob("Y2").flatten() workspace.FeedBlob("Y2", np.zeros((1, 1))) diff --git a/caffe2/queue/blobs_queue_db.cc 
b/caffe2/queue/blobs_queue_db.cc index 06a6985848ce2..bd7795c94ad2e 100644 --- a/caffe2/queue/blobs_queue_db.cc +++ b/caffe2/queue/blobs_queue_db.cc @@ -32,7 +32,7 @@ class CreateBlobsQueueDBOp : public Operator { } private: - AT_DISABLE_COPY_AND_ASSIGN(CreateBlobsQueueDBOp); + C10_DISABLE_COPY_AND_ASSIGN(CreateBlobsQueueDBOp); }; REGISTER_CPU_OPERATOR(CreateBlobsQueueDB, CreateBlobsQueueDBOp); diff --git a/caffe2/sgd/iter_op.cc b/caffe2/sgd/iter_op.cc index df9e261f2ea7f..ac964018b99e7 100644 --- a/caffe2/sgd/iter_op.cc +++ b/caffe2/sgd/iter_op.cc @@ -1,5 +1,10 @@ #include "caffe2/sgd/iter_op.h" +#ifdef CAFFE2_USE_IDEEP +#include +#include +#endif + namespace caffe2 { void MutexSerializer::Serialize( @@ -22,6 +27,10 @@ void MutexDeserializer::Deserialize(const BlobProto& /* unused */, Blob* blob) { REGISTER_CPU_OPERATOR(Iter, IterOp); REGISTER_CPU_OPERATOR(AtomicIter, AtomicIterOp); +#ifdef CAFFE2_USE_IDEEP +REGISTER_IDEEP_OPERATOR(AtomicIter, IDEEPFallbackOp>); +#endif + REGISTER_BLOB_SERIALIZER( (TypeMeta::Id>()), MutexSerializer); diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index 4ac3524d49d8a..d102985e2fd7a 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index 05c945106c52d..f11e05b67392c 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -231,11 +231,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { (transformedFilterSize + sizeof(float) - 1) / sizeof(float); for (auto g = 0; g < group_; g++) { - transformedFilters_[g] = ws_->CreateBlob( - "__transformed_kernel_" + - to_string(__sync_fetch_and_add( - &precomputed_transform_id, 1))) - ->GetMutableTensor(CPU); + transformedFilters_[g] = BlobGetMutableTensor( + ws_->CreateBlob( + "__transformed_kernel_" + + to_string( + __sync_fetch_and_add(&precomputed_transform_id, 1))), + CPU); transformedFilters_[g]->Resize(transformedFilterElements); status = nnp_convolution_inference( diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index 2f892118982da..10eb6348becc0 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -19,7 +19,7 @@ void AddNoiseInput( DeviceOption option; CPUContext context(option); Blob* blob = ws->CreateBlob(name); - auto* tensor = blob->GetMutableTensor(CPU); + auto* tensor = BlobGetMutableTensor(blob, CPU); tensor->Resize(shape); math::RandGaussian( diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh index cf54f9e851bfa..0f94ae9e018ba 100644 --- a/caffe2/utils/GpuDefs.cuh +++ b/caffe2/utils/GpuDefs.cuh @@ -8,7 +8,7 @@ namespace caffe2 { // Static definition of GPU warp size for unrolling and code generation #ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ <= 700 +#if __CUDA_ARCH__ <= 750 constexpr int kWarpSize = 32; #else #error Unknown __CUDA_ARCH__; please define parameters for compute capability diff --git a/caffe2/utils/hip/math_blas_hip_test.cc b/caffe2/utils/hip/math_blas_hip_test.cc index 911c2b09868fc..a5df5900ee23a 100644 --- 
a/caffe2/utils/hip/math_blas_hip_test.cc +++ b/caffe2/utils/hip/math_blas_hip_test.cc @@ -26,13 +26,13 @@ TEST(MathROCBLASTest, GemmNoTransNoTrans) { vector shapeX{5, 10}; vector shapeW{10, 6}; vector shapeY{5, 6}; - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorW = blobW->GetMutableTensor(HIP); + auto* tensorW = BlobGetMutableTensor(blobW, HIP); tensorW->Resize(shapeW); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -126,13 +126,13 @@ TEST(MathROCBLASTest, GemmNoTransTrans) { vector shapeX{5, 10}; vector shapeW{6, 10}; vector shapeY{5, 6}; - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorW = blobW->GetMutableTensor(HIP); + auto* tensorW = BlobGetMutableTensor(blobW, HIP); tensorW->Resize(shapeW); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorX->size(), 50); @@ -225,13 +225,13 @@ TEST(MathROCBLASTest, GemvNoTrans) { vector shapeA{5, 10}; vector shapeX{10}; vector shapeY{5}; - auto* tensorA = blobA->GetMutableTensor(HIP); + auto* tensorA = BlobGetMutableTensor(blobA, HIP); tensorA->Resize(shapeA); - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 50); @@ -315,13 +315,13 @@ TEST(MathROCBLASTest, GemvTrans) { vector shapeA{6, 10}; vector shapeX{6}; vector shapeY{10}; - auto* tensorA = blobA->GetMutableTensor(HIP); + auto* tensorA = BlobGetMutableTensor(blobA, HIP); tensorA->Resize(shapeA); - auto* tensorX = blobX->GetMutableTensor(HIP); + auto* tensorX = BlobGetMutableTensor(blobX, HIP); tensorX->Resize(shapeX); - auto* tensorY = blobY->GetMutableTensor(HIP); + auto* tensorY = BlobGetMutableTensor(blobY, HIP); tensorY->Resize(shapeY); - auto* tensorY_host = blobY_host->GetMutableTensor(CPU); + auto* tensorY_host = BlobGetMutableTensor(blobY_host, CPU); tensorY_host->Resize(shapeY); EXPECT_EQ(tensorA->size(), 60); diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 18e20e4fa4141..e770bcfd9afae 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -76,7 +76,7 @@ namespace math { // (transpose) if the argument TransA or TransB is set to CblasNoTrans or // CblasTrans, respectively, for each of A and B. 
template <> -CAFFE2_EXPORT void Gemm( +C10_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -134,7 +134,7 @@ CAFFE2_EXPORT void Gemm( } template <> -CAFFE2_EXPORT void GemmEx( +C10_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -206,7 +206,7 @@ CAFFE2_EXPORT void GemmEx( } template <> -CAFFE2_EXPORT void Gemv( +C10_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -245,7 +245,7 @@ CAFFE2_EXPORT void Gemv( #define CAFFE2_SPECIALIZED_DOT(T) \ template <> \ - CAFFE2_EXPORT void Dot( \ + C10_EXPORT void Dot( \ const int N, const T* a, const T* b, T* y, CPUContext* context) { \ *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ } @@ -254,12 +254,12 @@ CAFFE2_SPECIALIZED_DOT(float) #define CAFFE2_SPECIALIZED_AXPY(T) \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \ EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * alpha; \ } \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \ EigenVectorMap(Y, N) += ConstEigenVectorMap(x, N) * (*alpha); \ } @@ -268,7 +268,7 @@ CAFFE2_SPECIALIZED_AXPY(float) #define CAFFE2_SPECIALIZED_AXPBY(T) \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T alpha, \ const T* x, \ @@ -279,7 +279,7 @@ CAFFE2_SPECIALIZED_AXPY(float) y_arr = y_arr * beta + ConstEigenVectorArrayMap(x, N) * alpha; \ } \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T* alpha, \ const T* x, \ @@ -295,7 +295,7 @@ CAFFE2_SPECIALIZED_AXPBY(float) #else // CAFFE2_USE_EIGEN_FOR_BLAS template <> -CAFFE2_EXPORT void Gemm( +C10_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -328,7 +328,7 @@ CAFFE2_EXPORT void Gemm( } template <> -CAFFE2_EXPORT void GemmEx( +C10_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -361,7 +361,7 @@ CAFFE2_EXPORT void GemmEx( } template <> -CAFFE2_EXPORT void Gemv( +C10_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -377,7 +377,7 @@ CAFFE2_EXPORT void Gemv( #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha alpha, \ const TData* x, \ @@ -391,7 +391,7 @@ CAFFE2_EXPORT void Gemv( } \ } \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha* alpha, \ const TData* x, \ @@ -411,7 +411,7 @@ CAFFE2_SPECIALIZED_SCALE(float, double, d) #define CAFFE2_SPECIALIZED_DOT(T, prefix) \ template <> \ - CAFFE2_EXPORT void Dot( \ + C10_EXPORT void Dot( \ const int N, const T* a, const T* b, T* y, CPUContext*) { \ *y = cblas_##prefix##dot(N, a, 1, b, 1); \ } @@ -420,12 +420,12 @@ CAFFE2_SPECIALIZED_DOT(float, s) #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T alpha, const T* x, T* y, CPUContext*) { \ cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ } \ template <> \ - CAFFE2_EXPORT void Axpy( \ + C10_EXPORT void Axpy( \ const int N, const T* alpha, const T* x, T* y, CPUContext*) { \ cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ } @@ -437,7 +437,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) #ifdef CAFFE2_USE_MKL #define 
CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T alpha, \ const T* x, \ @@ -447,7 +447,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ } \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T* alpha, \ const T* x, \ @@ -459,7 +459,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) #else // CAFFE2_USE_MKL #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T alpha, \ const T* x, \ @@ -470,7 +470,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ } \ template <> \ - CAFFE2_EXPORT void Axpby( \ + C10_EXPORT void Axpby( \ const int N, \ const T* alpha, \ const T* x, \ @@ -488,7 +488,7 @@ CAFFE2_SPECIALIZED_AXPBY(float, s) #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha alpha, \ const TData* x, \ @@ -498,7 +498,7 @@ CAFFE2_SPECIALIZED_AXPBY(float, s) ConstEigenVectorMap(x, n) * static_cast(alpha); \ } \ template <> \ - CAFFE2_EXPORT void Scale( \ + C10_EXPORT void Scale( \ const int n, \ const TAlpha* alpha, \ const TData* x, \ @@ -517,7 +517,7 @@ CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t) #undef CAFFE2_SPECIALIZED_SCALE template <> -CAFFE2_EXPORT void GemmBatched( +C10_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -563,7 +563,7 @@ CAFFE2_EXPORT void GemmBatched( } template <> -CAFFE2_EXPORT void GemmStridedBatched( +C10_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -632,10 +632,11 @@ CAFFE2_EXPORT void GemmStridedBatched( //////////////////////////////////////////////////////////////////////////////// #ifdef CAFFE2_USE_MKL -#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \ - template <> \ - CAFFE2_EXPORT void Funcname(const int N, const T* x, T* y, CPUContext*) { \ - OriginalFunc(N, x, y, ##__VA_ARGS__); \ +#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) 
\ + template <> \ + C10_EXPORT void Funcname( \ + const int N, const T* x, T* y, CPUContext*) { \ + OriginalFunc(N, x, y, ##__VA_ARGS__); \ } DELEGATE_SIMPLE_UNARY_FUNCTION( float, @@ -683,7 +684,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, vdInv) #define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \ template <> \ - CAFFE2_EXPORT void SinCos( \ + C10_EXPORT void SinCos( \ const int N, const T* a, T* ys, T* yc, CPUContext*) { \ OriginalFunc(N, a, ys, yc); \ } @@ -691,10 +692,11 @@ DELEGATE_SINCOS_FUNCTION(float, vsSinCos) DELEGATE_SINCOS_FUNCTION(double, vdSinCos) #undef DELEGATE_SINCOS_FUNCTION -#define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \ - template <> \ - CAFFE2_EXPORT void Powx(const int N, const T* a, T b, T* y, CPUContext*) { \ - OriginalFunc(N, a, b, y); \ +#define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \ + template <> \ + C10_EXPORT void Powx( \ + const int N, const T* a, T b, T* y, CPUContext*) { \ + OriginalFunc(N, a, b, y); \ } DELEGATE_POWX_FUNCTION(float, vsPowx) DELEGATE_POWX_FUNCTION(double, vdPowx) @@ -702,7 +704,7 @@ DELEGATE_POWX_FUNCTION(double, vdPowx) #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int N, const T* A, const T* B, T* C, CPUContext*) { \ FuncImpl(N, A, B, C); \ } @@ -718,10 +720,11 @@ DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) #else // CAFFE2_USE_MKL -#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ - template <> \ - CAFFE2_EXPORT void Funcname(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).expr(); \ +#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ + template <> \ + C10_EXPORT void Funcname( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).expr(); \ } DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp) @@ -750,7 +753,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, rsqrt) #define DELEGATE_SINCOS_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void SinCos( \ + C10_EXPORT void SinCos( \ const int N, const T* x, T* ys, T* yc, CPUContext*) { \ EigenVectorMap(ys, N) = ConstEigenVectorArrayMap(x, N).sin(); \ EigenVectorMap(yc, N) = ConstEigenVectorArrayMap(x, N).cos(); \ @@ -761,7 +764,8 @@ DELEGATE_SINCOS_FUNCTION(double) #define DELEGATE_TANH_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void Tanh(const int N, const T* X, T* Y, CPUContext*) { \ + C10_EXPORT void Tanh( \ + const int N, const T* X, T* Y, CPUContext*) { \ EigenVectorMap(Y, N) = T(1) - \ ((ConstEigenVectorArrayMap(X, N) * T(2)).exp() + T(1)).inverse() * \ T(2); \ @@ -770,10 +774,11 @@ DELEGATE_TANH_FUNCTION(float) DELEGATE_TANH_FUNCTION(double) #undef DELEGATE_TANH_FUNCTION -#define DELEGATE_CBRT_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Cbrt(const int N, const T* X, T* Y, CPUContext*) { \ - std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \ +#define DELEGATE_CBRT_FUNCTION(T) \ + template <> \ + C10_EXPORT void Cbrt( \ + const int N, const T* X, T* Y, CPUContext*) { \ + std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \ } DELEGATE_CBRT_FUNCTION(float) DELEGATE_CBRT_FUNCTION(double) @@ -781,28 +786,30 @@ DELEGATE_CBRT_FUNCTION(double) #define DELEGATE_POWX_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void Powx( \ + C10_EXPORT void Powx( \ const int N, const T* a, const T b, T* y, CPUContext*) { \ EigenVectorMap(y, N) = ConstEigenVectorArrayMap(a, N).pow(b); \ } 
DELEGATE_POWX_FUNCTION(float) #undef DELEGATE_POWX_FUNCTION -#define DELEGATE_SINH_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Sinh(const int N, const T* X, T* Y, CPUContext*) { \ - ConstEigenVectorArrayMap X_arr(X, N); \ - EigenVectorMap(Y, N) = (X_arr.exp() - (-X_arr).exp()) / 2; \ +#define DELEGATE_SINH_FUNCTION(T) \ + template <> \ + C10_EXPORT void Sinh( \ + const int N, const T* X, T* Y, CPUContext*) { \ + ConstEigenVectorArrayMap X_arr(X, N); \ + EigenVectorMap(Y, N) = (X_arr.exp() - (-X_arr).exp()) / 2; \ } DELEGATE_SINH_FUNCTION(float) DELEGATE_SINH_FUNCTION(double) #undef DELEGATE_SINH_FUNCTION -#define DELEGATE_COSH_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Cosh(const int N, const T* X, T* Y, CPUContext*) { \ - ConstEigenVectorArrayMap X_arr(X, N); \ - EigenVectorMap(Y, N) = (X_arr.exp() + (-X_arr).exp()) / 2; \ +#define DELEGATE_COSH_FUNCTION(T) \ + template <> \ + C10_EXPORT void Cosh( \ + const int N, const T* X, T* Y, CPUContext*) { \ + ConstEigenVectorArrayMap X_arr(X, N); \ + EigenVectorMap(Y, N) = (X_arr.exp() + (-X_arr).exp()) / 2; \ } DELEGATE_COSH_FUNCTION(float) DELEGATE_COSH_FUNCTION(double) @@ -810,7 +817,8 @@ DELEGATE_COSH_FUNCTION(double) #define DELEGATE_INV_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void Inv(const int N, const T* x, T* y, CPUContext*) { \ + C10_EXPORT void Inv( \ + const int N, const T* x, T* y, CPUContext*) { \ EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).inverse(); \ } DELEGATE_INV_FUNCTION(float) @@ -819,10 +827,11 @@ DELEGATE_INV_FUNCTION(double) #endif // CAFFE2_USE_MKL -#define DELEGATE_NEG_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Neg(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = -ConstEigenVectorMap(x, N); \ +#define DELEGATE_NEG_FUNCTION(T) \ + template <> \ + C10_EXPORT void Neg( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = -ConstEigenVectorMap(x, N); \ } DELEGATE_NEG_FUNCTION(float) DELEGATE_NEG_FUNCTION(double) @@ -830,10 +839,11 @@ DELEGATE_NEG_FUNCTION(std::int32_t) DELEGATE_NEG_FUNCTION(std::int64_t) #undef DELEGATE_NEG_FUNCTION -#define DELEGATE_SIGN_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Sign(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).sign(); \ +#define DELEGATE_SIGN_FUNCTION(T) \ + template <> \ + C10_EXPORT void Sign( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).sign(); \ } DELEGATE_SIGN_FUNCTION(float) DELEGATE_SIGN_FUNCTION(double) @@ -841,10 +851,11 @@ DELEGATE_SIGN_FUNCTION(std::int32_t) DELEGATE_SIGN_FUNCTION(std::int64_t) #undef DELEGATE_SIGN_FUNCTION -#define DELEGATE_ABS_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Abs(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).abs(); \ +#define DELEGATE_ABS_FUNCTION(T) \ + template <> \ + C10_EXPORT void Abs( \ + const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).abs(); \ } #ifndef CAFFE2_USE_MKL DELEGATE_ABS_FUNCTION(float) @@ -854,10 +865,11 @@ DELEGATE_ABS_FUNCTION(std::int32_t) DELEGATE_ABS_FUNCTION(std::int64_t) #undef DELEGATE_ABS_FUNCTION -#define DELEGATE_CUBE_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void Cube(const int N, const T* X, T* Y, CPUContext*) { \ - EigenVectorMap(Y, N) = ConstEigenVectorArrayMap(X, N).cube(); \ +#define DELEGATE_CUBE_FUNCTION(T) \ + template <> \ + C10_EXPORT void Cube( \ + const int N, const T* X, T* 
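In math_cpu.cc the same rename is applied inside the specialization-generating macros, so every explicit specialization they stamp out carries C10_EXPORT instead of CAFFE2_EXPORT. A compressed, self-contained sketch of that pattern (ExampleFill and its signature are hypothetical; only the placement of the annotation mirrors the CAFFE2_SPECIALIZED_* / DELEGATE_* helpers above):

#include "c10/macros/Macros.h"

// Primary template declared once; each macro invocation then emits one
// exported explicit specialization.
template <typename T>
void ExampleFill(const int n, T value, T* y);

#define EXAMPLE_SPECIALIZED_FILL(T)                              \
  template <>                                                    \
  C10_EXPORT void ExampleFill<T>(const int n, T value, T* y) {  \
    for (int i = 0; i < n; ++i) {                                \
      y[i] = value;                                              \
    }                                                            \
  }
EXAMPLE_SPECIALIZED_FILL(float)
EXAMPLE_SPECIALIZED_FILL(double)
#undef EXAMPLE_SPECIALIZED_FILL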
Y, CPUContext*) { \ + EigenVectorMap(Y, N) = ConstEigenVectorArrayMap(X, N).cube(); \ } DELEGATE_CUBE_FUNCTION(float) DELEGATE_CUBE_FUNCTION(double) @@ -867,7 +879,7 @@ DELEGATE_CUBE_FUNCTION(std::int64_t) #define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int N, const T* A, const T* B, T* C, CPUContext*) { \ EigenVectorMap(C, N) = ConstEigenVectorArrayMap(A, N) \ expr ConstEigenVectorArrayMap(B, N); \ @@ -903,19 +915,20 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) // Eigen or via custom code. //////////////////////////////////////////////////////////////////////////////// -#define CAFFE2_SPECIALIZED_SET(T) \ - template <> \ - CAFFE2_EXPORT void Set(const size_t N, const T alpha, T* Y, CPUContext*) { \ - if (N == 0) { \ - return; \ - } \ - if (alpha == (T)0) { \ - if (Y != nullptr) { \ - std::memset(Y, 0, N * sizeof(T)); \ - } \ - } else { \ - EigenVectorMap(Y, N).setConstant(alpha); \ - } \ +#define CAFFE2_SPECIALIZED_SET(T) \ + template <> \ + C10_EXPORT void Set( \ + const size_t N, const T alpha, T* Y, CPUContext*) { \ + if (N == 0) { \ + return; \ + } \ + if (alpha == (T)0) { \ + if (Y != nullptr) { \ + std::memset(Y, 0, N * sizeof(T)); \ + } \ + } else { \ + EigenVectorMap(Y, N).setConstant(alpha); \ + } \ } CAFFE2_SPECIALIZED_SET(float); @@ -932,7 +945,7 @@ CAFFE2_SPECIALIZED_SET(uint16_t); #define CAFFE2_SPECIALIZED_REDUCEMIN(T) \ template <> \ - CAFFE2_EXPORT void ReduceMin( \ + C10_EXPORT void ReduceMin( \ const int N, \ const T* x, \ T* y, \ @@ -945,7 +958,7 @@ CAFFE2_SPECIALIZED_REDUCEMIN(float) #define CAFFE2_SPECIALIZED_REDUCEMAX(T) \ template <> \ - CAFFE2_EXPORT void ReduceMax( \ + C10_EXPORT void ReduceMax( \ const int N, \ const T* x, \ T* y, \ @@ -991,7 +1004,7 @@ struct SquaredL2NormFunctor { #define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \ template \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, const int cols, const T alpha, const T* X, T* Y) { \ EigenVectorMap(Y, rows) = \ ConstEigenMatrixMap(X, cols, rows).colwise().EigenOp() * alpha; \ @@ -1006,7 +1019,7 @@ DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm) #define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \ template \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, const int cols, const T alpha, const T* X, T* Y) { \ EigenVectorMap(Y, cols) = \ ConstEigenMatrixMap(X, cols, rows).rowwise().EigenOp() * alpha; \ @@ -1020,7 +1033,7 @@ DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm) #undef DELEGATE_COLWISE_REDUCE_FUNCTION template -CAFFE2_EXPORT void BothEndsReduceMin( +C10_EXPORT void BothEndsReduceMin( const int pre, const int mid, const int nxt, @@ -1044,7 +1057,7 @@ CAFFE2_EXPORT void BothEndsReduceMin( } template -CAFFE2_EXPORT void BothEndsReduceMax( +C10_EXPORT void BothEndsReduceMax( const int pre, const int mid, const int nxt, @@ -1066,7 +1079,7 @@ CAFFE2_EXPORT void BothEndsReduceMax( } template -CAFFE2_EXPORT void BothEndsReduceSum( +C10_EXPORT void BothEndsReduceSum( const int pre, const int mid, const int nxt, @@ -1087,7 +1100,7 @@ CAFFE2_EXPORT void BothEndsReduceSum( } template -CAFFE2_EXPORT void BothEndsReduceMean( +C10_EXPORT void BothEndsReduceMean( const int pre, const int mid, const int nxt, @@ -1108,7 +1121,7 @@ CAFFE2_EXPORT void BothEndsReduceMean( } template -CAFFE2_EXPORT void BothEndsReduceL1( +C10_EXPORT void BothEndsReduceL1( const int pre, const int mid, const int nxt, @@ -1135,7 +1148,7 @@ CAFFE2_EXPORT void 
BothEndsReduceL1( } template -CAFFE2_EXPORT void BothEndsReduceL2( +C10_EXPORT void BothEndsReduceL2( const int pre, const int mid, const int nxt, @@ -1155,7 +1168,7 @@ CAFFE2_EXPORT void BothEndsReduceL2( } template -CAFFE2_EXPORT void ReduceTensor( +C10_EXPORT void ReduceTensor( const int ndim, const int* X_dims, const int* Y_dims, @@ -1183,7 +1196,7 @@ CAFFE2_EXPORT void ReduceTensor( #define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1325,7 +1338,7 @@ DELEGATE_REDUCE_FUNCTION( #define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \ template <> \ - CAFFE2_EXPORT void ReduceMean( \ + C10_EXPORT void ReduceMean( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1392,7 +1405,7 @@ CAFFE2_SPECIALIZED_REDUCE_MEAN(double) #define CAFFE2_SPECIALIZED_REDUCE_L2(T) \ template <> \ - CAFFE2_EXPORT void ReduceL2( \ + C10_EXPORT void ReduceL2( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1462,7 +1475,7 @@ CAFFE2_SPECIALIZED_REDUCE_L2(double) namespace { template -CAFFE2_EXPORT void BroadcastImpl( +C10_EXPORT void BroadcastImpl( const int X_ndim, const int* X_dims, const int Y_ndim, @@ -1495,7 +1508,7 @@ CAFFE2_EXPORT void BroadcastImpl( #define CAFFE2_SPECIALIZED_BROADCAST(T) \ template <> \ - CAFFE2_EXPORT void Broadcast( \ + C10_EXPORT void Broadcast( \ const int X_ndim, \ const int* X_dims, \ const int Y_ndim, \ @@ -1515,7 +1528,7 @@ CAFFE2_SPECIALIZED_BROADCAST(double) namespace { template -CAFFE2_EXPORT void RowwiseMoments( +C10_EXPORT void RowwiseMoments( const int rows, const int cols, const T* X, @@ -1529,7 +1542,7 @@ CAFFE2_EXPORT void RowwiseMoments( } template -CAFFE2_EXPORT void ColwiseMoments( +C10_EXPORT void ColwiseMoments( const int rows, const int cols, const T* X, @@ -1551,7 +1564,7 @@ CAFFE2_EXPORT void ColwiseMoments( } template -CAFFE2_EXPORT void BothEndsMoments( +C10_EXPORT void BothEndsMoments( const int pre, const int mid, const int nxt, @@ -1576,7 +1589,7 @@ CAFFE2_EXPORT void BothEndsMoments( } template -CAFFE2_EXPORT void MomentsImpl( +C10_EXPORT void MomentsImpl( const int num_dims, const int* dims, const int num_axes, @@ -1643,7 +1656,7 @@ CAFFE2_EXPORT void MomentsImpl( #define CAFFE2_SPECIALIZED_MOMENTS(T) \ template <> \ - CAFFE2_EXPORT void Moments( \ + C10_EXPORT void Moments( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -1674,7 +1687,7 @@ CAFFE2_SPECIALIZED_INV_STD(float) #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \ template <> \ - CAFFE2_EXPORT void RowwiseMax( \ + C10_EXPORT void RowwiseMax( \ const int N, const int D, const T* x, T* y, CPUContext*) { \ EigenVectorMap(y, N) = \ ConstEigenMatrixMap(x, D, N).colwise().maxCoeff(); \ @@ -1684,7 +1697,7 @@ CAFFE2_SPECIALIZED_ROWWISEMAX(float) #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \ template <> \ - CAFFE2_EXPORT void ColwiseMax( \ + C10_EXPORT void ColwiseMax( \ const int N, const int D, const T* x, T* y, CPUContext*) { \ EigenVectorMap(y, D) = \ ConstEigenMatrixMap(x, D, N).rowwise().maxCoeff(); \ @@ -1694,7 +1707,7 @@ CAFFE2_SPECIALIZED_COLWISEMAX(float) #define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \ template <> \ - CAFFE2_EXPORT void ElemwiseMax( \ + C10_EXPORT void ElemwiseMax( \ const int N, const T* x, const T* y, T* z, CPUContext* /*context*/) { \ std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \ return std::max(x_i, y_i); \ @@ -1705,7 +1718,7 @@ CAFFE2_SPECIALIZED_ELEMWISEMAX(float) #define 
CAFFE2_SPECIALIZED_MAXIMUM(T) \ template <> \ - CAFFE2_EXPORT void Maximum( \ + C10_EXPORT void Maximum( \ const int N, const float alpha, const T* x, T* y, CPUContext* context) { \ std::transform( \ x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \ @@ -1718,7 +1731,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) #define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1735,7 +1748,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) } \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1755,7 +1768,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) #define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1772,7 +1785,7 @@ CAFFE2_SPECIALIZED_MAXIMUM(float) } \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const T* A, \ @@ -1808,7 +1821,7 @@ DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) #define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void RowwiseSub( \ + C10_EXPORT void RowwiseSub( \ const int rows, \ const int cols, \ const T* A, \ @@ -1820,7 +1833,7 @@ DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) ConstEigenVectorArrayMap(A, cols); \ } \ template <> \ - CAFFE2_EXPORT void ColwiseSub( \ + C10_EXPORT void ColwiseSub( \ const int rows, \ const int cols, \ const T* A, \ @@ -1842,7 +1855,7 @@ DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) #define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ template <> \ - CAFFE2_EXPORT void RowwiseDiv( \ + C10_EXPORT void RowwiseDiv( \ const int rows, \ const int cols, \ const T* A, \ @@ -1854,7 +1867,7 @@ DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) ConstEigenVectorArrayMap(A, cols); \ } \ template <> \ - CAFFE2_EXPORT void ColwiseDiv( \ + C10_EXPORT void ColwiseDiv( \ const int rows, \ const int cols, \ const T* A, \ @@ -1878,7 +1891,7 @@ DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /) #undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION template <> -CAFFE2_EXPORT void Not( +C10_EXPORT void Not( const int N, const bool* x, bool* y, @@ -1893,7 +1906,7 @@ CAFFE2_EXPORT void Not( #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \ template <> \ - CAFFE2_EXPORT void AddStripedBatch( \ + C10_EXPORT void AddStripedBatch( \ const int N, \ const T* first, \ T* y, \ @@ -1911,7 +1924,7 @@ CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); namespace { template -CAFFE2_EXPORT void RowwiseBinaryOp( +C10_EXPORT void RowwiseBinaryOp( const int rows, const int cols, const BinaryOperator& op, @@ -1929,7 +1942,7 @@ CAFFE2_EXPORT void RowwiseBinaryOp( } template -CAFFE2_EXPORT void ColwiseBinaryOp( +C10_EXPORT void ColwiseBinaryOp( const int rows, const int cols, const BinaryOperator& op, @@ -1947,7 +1960,7 @@ CAFFE2_EXPORT void ColwiseBinaryOp( } template -CAFFE2_EXPORT void BroadcastBinaryOpImpl( +C10_EXPORT void BroadcastBinaryOpImpl( const int ndim, const int* A_dims, const int* B_dims, @@ -1971,7 +1984,7 @@ CAFFE2_EXPORT void BroadcastBinaryOpImpl( #define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int N, const TIn* A, const TIn* B, TOut* C, 
CPUContext*) { \ std::transform(A, A + N, B, C, Op()); \ } @@ -2011,7 +2024,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2021,7 +2034,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) RowwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ } \ template <> \ - CAFFE2_EXPORT void Rowwise##Func( \ + C10_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2032,7 +2045,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) rows, cols, Op(), A, B, C); \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2042,7 +2055,7 @@ DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) ColwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ } \ template <> \ - CAFFE2_EXPORT void Colwise##Func( \ + C10_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -2086,28 +2099,28 @@ DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION -#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ - template <> \ - CAFFE2_EXPORT void RowwiseDiv( \ - const int rows, \ - const int cols, \ - const T* A, \ - const T* B, \ - T* C, \ - CPUContext*) { \ - RowwiseBinaryOp, true>( \ - rows, cols, std::divides(), A, B, C); \ - } \ - template <> \ - CAFFE2_EXPORT void ColwiseDiv( \ - const int rows, \ - const int cols, \ - const T* A, \ - const T* B, \ - T* C, \ - CPUContext*) { \ - ColwiseBinaryOp, true>( \ - rows, cols, std::divides(), A, B, C); \ +#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ + template <> \ + C10_EXPORT void RowwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + RowwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ + } \ + template <> \ + C10_EXPORT void ColwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + ColwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ } DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t) DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) @@ -2115,7 +2128,7 @@ DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) #define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - CAFFE2_EXPORT void Func( \ + C10_EXPORT void Func( \ const int A_ndim, \ const int* A_dims, \ const int B_ndim, \ @@ -2258,7 +2271,7 @@ DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #define CAFFE2_RAND_UNIFORM_REAL(T) \ template <> \ - CAFFE2_EXPORT void RandUniform( \ + C10_EXPORT void RandUniform( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ std::uniform_real_distribution distribution(a, b); \ for (size_t i = 0; i < n; ++i) { \ @@ -2271,7 +2284,7 @@ CAFFE2_RAND_UNIFORM_REAL(double); #define CAFFE2_RAND_UNIFORM_CHAR(T) \ template <> \ - CAFFE2_EXPORT void RandUniform( \ + C10_EXPORT void RandUniform( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ std::uniform_int_distribution distribution((short)a, (short)b); \ for (size_t i = 0; i < n; ++i) { \ @@ -2284,7 +2297,7 @@ CAFFE2_RAND_UNIFORM_CHAR(uint8_t); #define CAFFE2_RAND_UNIFORM_INT(T) \ template <> \ - CAFFE2_EXPORT void RandUniform( \ + C10_EXPORT void 
RandUniform( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ std::uniform_int_distribution distribution(a, b); \ for (size_t i = 0; i < n; ++i) { \ @@ -2310,7 +2323,7 @@ CAFFE2_RAND_UNIFORM_INT(uint64_t); // each value. #define CAFFE2_RAND_FIXED_SUM(T) \ template <> \ - CAFFE2_EXPORT void RandFixedSum( \ + C10_EXPORT void RandFixedSum( \ const size_t n, \ const T a, \ const T b, \ @@ -2404,7 +2417,7 @@ Ind_t generate_stack_distance( } template -CAFFE2_EXPORT void generate_trace_lru( +C10_EXPORT void generate_trace_lru( std::vector& uni_ref, std::vector& cum_val, std::vector& cum_dis, @@ -2481,7 +2494,7 @@ CAFFE2_EXPORT void generate_trace_lru( // case we need to know the table id, to sample from the right distribution #define CAFFE2_RAND_SYNTHETIC_DATA(T) \ template <> \ - CAFFE2_EXPORT void RandSyntheticData( \ + C10_EXPORT void RandSyntheticData( \ const size_t n, const T a, const T b, T* r, CPUContext* context) { \ /* unique memory references */ \ std::vector mem_ref = {1, 2, 3, 4, 5, 6}; \ @@ -2518,32 +2531,33 @@ CAFFE2_RAND_SYNTHETIC_DATA(uint32_t); CAFFE2_RAND_SYNTHETIC_DATA(uint64_t); #undef CAFFE2_RAND_SYNTHETIC_DATA -#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ - template <> \ - CAFFE2_EXPORT void RandUniformUnique( \ - const size_t n, \ - const T a, \ - const T b, \ - T* r, \ - const size_t m, \ - const T* avoid, \ - CPUContext* context) { \ - CAFFE_ENFORCE_LE( \ - n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ - std::unordered_set avoid_set(n); \ - if (m) { \ - avoid_set.insert(avoid, avoid + m); \ - CAFFE_ENFORCE_EQ(m, avoid_set.size(), "ACAFFE2_EXPORT void should be unique"); \ - } \ - std::uniform_int_distribution distribution(a, b); \ - T v = 0; \ - for (size_t i = 0; i < n; ++i) { \ - do { \ - v = distribution(context->RandGenerator()); \ - } while (avoid_set.count(v)); \ - r[i] = v; \ - avoid_set.insert(v); \ - } \ +#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ + template <> \ + C10_EXPORT void RandUniformUnique( \ + const size_t n, \ + const T a, \ + const T b, \ + T* r, \ + const size_t m, \ + const T* avoid, \ + CPUContext* context) { \ + CAFFE_ENFORCE_LE( \ + n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ + std::unordered_set avoid_set(n); \ + if (m) { \ + avoid_set.insert(avoid, avoid + m); \ + CAFFE_ENFORCE_EQ( \ + m, avoid_set.size(), "AC10_EXPORT void should be unique"); \ + } \ + std::uniform_int_distribution distribution(a, b); \ + T v = 0; \ + for (size_t i = 0; i < n; ++i) { \ + do { \ + v = distribution(context->RandGenerator()); \ + } while (avoid_set.count(v)); \ + r[i] = v; \ + avoid_set.insert(v); \ + } \ } CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t); @@ -2551,7 +2565,7 @@ CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t); #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE template <> -CAFFE2_EXPORT void RandGaussian( +C10_EXPORT void RandGaussian( const size_t n, const float mean, const float std, @@ -2565,7 +2579,7 @@ CAFFE2_EXPORT void RandGaussian( #define CAFFE2_SPECIALIZED_SUM(T) \ template <> \ - CAFFE2_EXPORT void Sum( \ + C10_EXPORT void Sum( \ const int N, \ const T* x, \ T* y, \ @@ -2581,7 +2595,7 @@ CAFFE2_SPECIALIZED_SUM(int64_t); #undef CAFFE2_SPECIALIZED_SUM template <> -CAFFE2_EXPORT void SumSqr( +C10_EXPORT void SumSqr( const int N, const float* x, float* y, @@ -2591,7 +2605,7 @@ CAFFE2_EXPORT void SumSqr( } template <> -CAFFE2_EXPORT void Select( +C10_EXPORT void Select( const int N, const int D, const float* x, @@ -2605,7 +2619,7 @@ CAFFE2_EXPORT void Select( } template <> 
-CAFFE2_EXPORT void CopyMatrix( +C10_EXPORT void CopyMatrix( const size_t itemsize, const int M, const int N, @@ -2648,7 +2662,7 @@ CAFFE2_EXPORT void CopyMatrix( #define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2659,7 +2673,7 @@ CAFFE2_EXPORT void CopyMatrix( Func('R', 'N', M, N, T(1), A, lda, B, ldb); \ } \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2690,7 +2704,7 @@ DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy) #define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2720,7 +2734,7 @@ DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy) } \ } \ template <> \ - CAFFE2_EXPORT void CopyMatrix( \ + C10_EXPORT void CopyMatrix( \ const int M, \ const int N, \ const T* A, \ @@ -2759,7 +2773,7 @@ CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t) namespace { template -CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( +C10_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( const int C, const int H, const int W, @@ -2806,7 +2820,7 @@ CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( } template -CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( +C10_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( const int C, const int H, const int W, @@ -2842,7 +2856,7 @@ CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( } template -CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( +C10_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( const int C, const int H, const int W, @@ -2867,7 +2881,7 @@ CAFFE2_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( } template -CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( +C10_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( const int C, const int H, const int W, @@ -2894,7 +2908,7 @@ CAFFE2_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( } template -CAFFE2_EXPORT void Im2ColNdNCHWImpl( +C10_EXPORT void Im2ColNdNCHWImpl( const int N, const int img_size, const int col_size, @@ -2950,7 +2964,7 @@ CAFFE2_EXPORT void Im2ColNdNCHWImpl( } // namespace template <> -CAFFE2_EXPORT void Im2ColNd( +C10_EXPORT void Im2ColNd( const int N, const int img_size, const int col_size, @@ -2978,7 +2992,7 @@ CAFFE2_EXPORT void Im2ColNd( } template <> -CAFFE2_EXPORT void Col2ImNd( +C10_EXPORT void Col2ImNd( const int N, const int img_size, const int col_size, @@ -3006,7 +3020,7 @@ CAFFE2_EXPORT void Col2ImNd( } template <> -CAFFE2_EXPORT void Im2Col( +C10_EXPORT void Im2Col( const int C, const int H, const int W, @@ -3072,7 +3086,7 @@ CAFFE2_EXPORT void Im2Col( } template <> -CAFFE2_EXPORT void Im2Col( +C10_EXPORT void Im2Col( const int C, const int H, const int W, @@ -3172,7 +3186,7 @@ CAFFE2_EXPORT void Im2Col( } template <> -CAFFE2_EXPORT void Col2Im( +C10_EXPORT void Col2Im( const int C, const int H, const int W, @@ -3239,7 +3253,7 @@ CAFFE2_EXPORT void Col2Im( } template <> -CAFFE2_EXPORT void Col2Im( +C10_EXPORT void Col2Im( const int C, const int H, const int W, @@ -3335,7 +3349,7 @@ CAFFE2_EXPORT void Col2Im( } template <> -CAFFE2_EXPORT void BiasCHW( +C10_EXPORT void BiasCHW( const float* bias, const float* /*bias_multiplier*/, const int bias_channels, @@ -3420,7 +3434,7 @@ CAFFE2_EXPORT void BiasCHW( #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \ template <> \ - CAFFE2_EXPORT void CopyVector( \ + C10_EXPORT void CopyVector( \ const int N, 
const T* src, T* dst, CPUContext* /*context*/) { \ if (src != dst && N > 0) { \ memcpy(dst, src, sizeof(T) * N); \ @@ -3633,7 +3647,7 @@ void TransposeCPUImpl( #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ template <> \ - CAFFE2_EXPORT void Transpose( \ + C10_EXPORT void Transpose( \ const int ndim, \ const int* dims, \ const int* axes, \ diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc index 9be1c3db6c1d0..4b0247a0786fc 100644 --- a/caffe2/utils/math_gpu_test.cc +++ b/caffe2/utils/math_gpu_test.cc @@ -41,9 +41,9 @@ void executeGpuBinaryOpTest( Blob* bloby = ws.CreateBlob("Y"); Blob* bloby_host = ws.CreateBlob("Y_host"); - auto* tensorx0 = blobx0->GetMutableTensor(CUDA); - auto* tensorx1 = blobx1->GetMutableTensor(CUDA); - auto* tensory = bloby->GetMutableTensor(CUDA); + auto* tensorx0 = BlobGetMutableTensor(blobx0, CUDA); + auto* tensorx1 = BlobGetMutableTensor(blobx1, CUDA); + auto* tensory = BlobGetMutableTensor(bloby, CUDA); vector shapex0_vector{shapex0}; vector shapex1_vector{shapex1}; @@ -71,7 +71,7 @@ void executeGpuBinaryOpTest( context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = bloby_host->GetMutableTensor(CPU); + auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -94,7 +94,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { vector shapex{33 * 9, 25}; vector shapey{33, 25}; - auto* tensorx = blobx->GetMutableTensor(CUDA); + auto* tensorx = BlobGetMutableTensor(blobx, CUDA); tensorx->Resize(shapex); int stripe = 33 * 25; vector tot(33, 0.0); @@ -110,7 +110,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { } } - auto* tensory = bloby->GetMutableTensor(CUDA); + auto* tensory = BlobGetMutableTensor(bloby, CUDA); tensory->Resize(shapey); math::Set( stripe, 0.0, tensory->mutable_data(), &context); @@ -125,7 +125,7 @@ TEST(MathUtilGPUTest, testAddStripedBatch) { context.FinishDeviceComputation(); // Copy result to CPU so we can inspect it - auto* tensory_host = bloby_host->GetMutableTensor(CPU); + auto* tensory_host = BlobGetMutableTensor(bloby_host, CPU); tensory_host->CopyFrom(*tensory, &context); context.FinishDeviceComputation(); @@ -258,9 +258,9 @@ class GemmBatchedGPUTest Blob* X_blob = ws_.CreateBlob("X"); Blob* W_blob = ws_.CreateBlob("W"); Blob* Y_blob = ws_.CreateBlob("Y"); - X_ = X_blob->GetMutableTensor(CUDA); - W_ = W_blob->GetMutableTensor(CUDA); - Y_ = Y_blob->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(X_blob, CUDA); + W_ = BlobGetMutableTensor(W_blob, CUDA); + Y_ = BlobGetMutableTensor(Y_blob, CUDA); X_->Resize(std::vector{3, 5, 10}); W_->Resize(std::vector{3, 6, 10}); Y_->Resize(std::vector{3, 5, 6}); @@ -381,8 +381,8 @@ class ReduceTensorGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -402,7 +402,7 @@ class ReduceTensorGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -664,8 +664,8 @@ class 
BroadcastGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -681,7 +681,7 @@ class BroadcastGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); @@ -741,9 +741,9 @@ class MomentsGPUTest : public testing::Test { Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_mean = ws_.CreateBlob("mean"); Blob* blob_variance = ws_.CreateBlob("variance"); - X_ = blob_x->GetMutableTensor(CUDA); - mean_ = blob_mean->GetMutableTensor(CUDA); - variance_ = blob_variance->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + mean_ = BlobGetMutableTensor(blob_mean, CUDA); + variance_ = BlobGetMutableTensor(blob_variance, CUDA); } void SetUpData( @@ -766,10 +766,10 @@ class MomentsGPUTest : public testing::Test { const std::vector& mean_data, const std::vector& variance_data) { Blob* blob_mean_host = ws_.CreateBlob("mean_host"); - auto* mean_host = blob_mean_host->GetMutableTensor(CPU); + auto* mean_host = BlobGetMutableTensor(blob_mean_host, CPU); mean_host->CopyFrom(*mean_, cuda_context_.get()); Blob* blob_variance_host = ws_.CreateBlob("variance_host"); - auto* variance_host = blob_variance_host->GetMutableTensor(CPU); + auto* variance_host = BlobGetMutableTensor(blob_variance_host, CPU); variance_host->CopyFrom(*variance_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); @@ -868,8 +868,8 @@ class TransposeGPUTest : public testing::Test { cuda_context_ = make_unique(option_); Blob* blob_x = ws_.CreateBlob("X"); Blob* blob_y = ws_.CreateBlob("Y"); - X_ = blob_x->GetMutableTensor(CUDA); - Y_ = blob_y->GetMutableTensor(CUDA); + X_ = BlobGetMutableTensor(blob_x, CUDA); + Y_ = BlobGetMutableTensor(blob_y, CUDA); } void SetUpData( @@ -890,7 +890,7 @@ class TransposeGPUTest : public testing::Test { void VerifyResult(const std::vector& expected_output) { Blob* blob_y_host = ws_.CreateBlob("Y_host"); - auto* Y_host = blob_y_host->GetMutableTensor(CPU); + auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU); Y_host->CopyFrom(*Y_, cuda_context_.get()); cuda_context_->FinishDeviceComputation(); ASSERT_EQ(expected_output.size(), Y_host->size()); diff --git a/caffe2/utils/proto_convert.cc b/caffe2/utils/proto_convert.cc index 24984203bcb81..790bd274291dc 100644 --- a/caffe2/utils/proto_convert.cc +++ b/caffe2/utils/proto_convert.cc @@ -3,7 +3,7 @@ namespace caffe2 { -CAFFE2_EXPORT void ArgumentToAttributeProto( +C10_EXPORT void ArgumentToAttributeProto( const Argument& arg, ::torch::AttributeProto* attr) { CAFFE_ENFORCE(arg.has_name()); @@ -29,7 +29,7 @@ CAFFE2_EXPORT void ArgumentToAttributeProto( } } -CAFFE2_EXPORT void AttributeProtoToArgument( +C10_EXPORT void AttributeProtoToArgument( const ::torch::AttributeProto& attr, Argument* arg) { CAFFE_ENFORCE(attr.has_name()); @@ -94,7 +94,7 @@ CAFFE2_EXPORT void AttributeProtoToArgument( } } -CAFFE2_EXPORT void OperatorDefToNodeProto( +C10_EXPORT void OperatorDefToNodeProto( const OperatorDef& def, ::torch::NodeProto* node) { 
node->mutable_input()->CopyFrom(def.input()); @@ -141,7 +141,7 @@ CAFFE2_EXPORT void OperatorDefToNodeProto( } } -CAFFE2_EXPORT void NodeProtoToOperatorDef( +C10_EXPORT void NodeProtoToOperatorDef( const ::torch::NodeProto& node, OperatorDef* def) { def->mutable_input()->CopyFrom(node.input()); diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc index 1daacff3eda2f..dc8e088eba97c 100644 --- a/caffe2/utils/proto_utils.cc +++ b/caffe2/utils/proto_utils.cc @@ -21,11 +21,11 @@ using ::google::protobuf::MessageLite; namespace caffe2 { -CAFFE2_EXPORT std::string DeviceTypeName(const int32_t& d) { +C10_EXPORT std::string DeviceTypeName(const int32_t& d) { return at::DeviceTypeName(static_cast(d)); } -CAFFE2_EXPORT int DeviceId(const DeviceOption& option) { +C10_EXPORT int DeviceId(const DeviceOption& option) { switch (option.device_type()) { case PROTO_CPU: return option.numa_node_id(); @@ -40,7 +40,7 @@ CAFFE2_EXPORT int DeviceId(const DeviceOption& option) { } } -CAFFE2_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { +C10_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs) { return ( lhs.device_type() == rhs.device_type() && lhs.cuda_gpu_id() == rhs.cuda_gpu_id() && @@ -49,7 +49,7 @@ CAFFE2_EXPORT bool IsSameDevice(const DeviceOption& lhs, const DeviceOption& rhs lhs.numa_node_id() == rhs.numa_node_id()); } -CAFFE2_EXPORT bool ReadStringFromFile(const char* filename, string* str) { +C10_EXPORT bool ReadStringFromFile(const char* filename, string* str) { std::ifstream ifs(filename, std::ios::in); if (!ifs) { VLOG(1) << "File cannot be opened: " << filename @@ -64,7 +64,7 @@ CAFFE2_EXPORT bool ReadStringFromFile(const char* filename, string* str) { return true; } -CAFFE2_EXPORT bool WriteStringToFile(const string& str, const char* filename) { +C10_EXPORT bool WriteStringToFile(const string& str, const char* filename) { std::ofstream ofs(filename, std::ios::out | std::ios::trunc); if (!ofs.is_open()) { VLOG(1) << "File cannot be created: " << filename @@ -102,11 +102,13 @@ class IfstreamInputStream : public ::google::protobuf::io::CopyingInputStream { }; } // namespace -CAFFE2_EXPORT string ProtoDebugString(const MessageLite& proto) { +C10_EXPORT string ProtoDebugString(const MessageLite& proto) { return proto.SerializeAsString(); } -CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, MessageLite* proto) { +C10_EXPORT bool ParseProtoFromLargeString( + const string& str, + MessageLite* proto) { ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size()); ::google::protobuf::io::CodedInputStream coded_stream(&input_stream); // Set PlanDef message size limit to 2G. 
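Editor's note on the export macros: the hunks above and below this point mechanically rename CAFFE2_EXPORT to C10_EXPORT, i.e. symbol-visibility annotations now come from the shared c10 macro layer instead of per-library definitions. For readers unfamiliar with the pattern, here is a minimal, hedged sketch of what such an export/import macro pair typically expands to; the real definitions live in c10/macros/Macros.h and cover more platforms and build modes, so the exact conditions below are illustrative assumptions, not the actual implementation. The new modules/observers/macros.h added later in this diff shows how a per-library *_API macro is then derived from the pair.

    // Sketch only: assumed platform split; the real macros are in c10/macros/Macros.h.
    #if defined(_WIN32)
    #  define C10_EXPORT __declspec(dllexport)   // building the DLL: export the symbol
    #  define C10_IMPORT __declspec(dllimport)   // consuming the DLL: import the symbol
    #else
    // On GCC/Clang, exported symbols are marked default-visible so they survive
    // -fvisibility=hidden builds; importing needs no special annotation.
    #  define C10_EXPORT __attribute__((__visibility__("default")))
    #  define C10_IMPORT C10_EXPORT
    #endif

    // A library-specific API macro (e.g. the CAFFE2_OBSERVER_API introduced in
    // modules/observers/macros.h below) then selects export when compiling that
    // library and import everywhere else:
    #ifdef CAFFE2_BUILD_OBSERVER_LIB
    #  define CAFFE2_OBSERVER_API C10_EXPORT
    #else
    #  define CAFFE2_OBSERVER_API C10_IMPORT
    #endif

This is why every separate shared library needs its own *_API macro: whether a given symbol is exported or imported depends on which library is currently being built.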
@@ -114,7 +116,9 @@ CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, MessageLite* pro return proto->ParseFromCodedStream(&coded_stream); } -CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) { +C10_EXPORT bool ReadProtoFromBinaryFile( + const char* filename, + MessageLite* proto) { ::google::protobuf::io::CopyingInputStreamAdaptor stream( new IfstreamInputStream(filename)); stream.SetOwnsCopyingStream(true); @@ -125,7 +129,7 @@ CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* pr return proto->ParseFromCodedStream(&coded_stream); } -CAFFE2_EXPORT void WriteProtoToBinaryFile( +C10_EXPORT void WriteProtoToBinaryFile( const MessageLite& /*proto*/, const char* /*filename*/) { LOG(FATAL) << "Not implemented yet."; @@ -144,16 +148,16 @@ using ::google::protobuf::io::CodedOutputStream; using ::google::protobuf::Message; namespace TextFormat { -CAFFE2_EXPORT bool ParseFromString(const string& spec, Message* proto) { +C10_EXPORT bool ParseFromString(const string& spec, Message* proto) { return ::google::protobuf::TextFormat::ParseFromString(spec, proto); } } // namespace TextFormat -CAFFE2_EXPORT string ProtoDebugString(const Message& proto) { +C10_EXPORT string ProtoDebugString(const Message& proto) { return proto.ShortDebugString(); } -CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) { +C10_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) { ::google::protobuf::io::ArrayInputStream input_stream(str.data(), str.size()); ::google::protobuf::io::CodedInputStream coded_stream(&input_stream); // Set PlanDef message size limit to 2G. @@ -161,7 +165,7 @@ CAFFE2_EXPORT bool ParseProtoFromLargeString(const string& str, Message* proto) return proto->ParseFromCodedStream(&coded_stream); } -CAFFE2_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) { +C10_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) { int fd = open(filename, O_RDONLY); CAFFE_ENFORCE_NE(fd, -1, "File not found: ", filename); FileInputStream* input = new FileInputStream(fd); @@ -171,7 +175,9 @@ CAFFE2_EXPORT bool ReadProtoFromTextFile(const char* filename, Message* proto) { return success; } -CAFFE2_EXPORT void WriteProtoToTextFile(const Message& proto, const char* filename) { +C10_EXPORT void WriteProtoToTextFile( + const Message& proto, + const char* filename) { int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); FileOutputStream* output = new FileOutputStream(fd); CAFFE_ENFORCE(google::protobuf::TextFormat::Print(proto, output)); @@ -179,7 +185,9 @@ CAFFE2_EXPORT void WriteProtoToTextFile(const Message& proto, const char* filena close(fd); } -CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* proto) { +C10_EXPORT bool ReadProtoFromBinaryFile( + const char* filename, + MessageLite* proto) { #if defined (_MSC_VER) // for MSC compiler binary flag needs to be specified int fd = open(filename, O_RDONLY | O_BINARY); #else @@ -198,7 +206,9 @@ CAFFE2_EXPORT bool ReadProtoFromBinaryFile(const char* filename, MessageLite* pr return success; } -CAFFE2_EXPORT void WriteProtoToBinaryFile(const MessageLite& proto, const char* filename) { +C10_EXPORT void WriteProtoToBinaryFile( + const MessageLite& proto, + const char* filename) { int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); CAFFE_ENFORCE_NE( fd, -1, "File cannot be created: ", filename, " error number: ", errno); @@ -213,8 +223,7 @@ CAFFE2_EXPORT void 
WriteProtoToBinaryFile(const MessageLite& proto, const char* #endif // CAFFE2_USE_LITE_PROTO - -CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) { +C10_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) { for (auto& arg : def.arg()) { if (arg_map_.count(arg.name())) { if (arg.SerializeAsString() != arg_map_[arg.name()].SerializeAsString()) { @@ -235,7 +244,7 @@ CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const OperatorDef& def) { } } -CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { +C10_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { for (auto& arg : netdef.arg()) { CAFFE_ENFORCE( arg_map_.count(arg.name()) == 0, @@ -245,7 +254,7 @@ CAFFE2_EXPORT ArgumentHelper::ArgumentHelper(const NetDef& netdef) { } } -CAFFE2_EXPORT bool ArgumentHelper::HasArgument(const string& name) const { +C10_EXPORT bool ArgumentHelper::HasArgument(const string& name) const { return arg_map_.count(name); } @@ -267,41 +276,42 @@ std::ostream& operator<<(std::ostream& output, const NetDef& n) { return output; } -#define INSTANTIATE_GET_SINGLE_ARGUMENT( \ - T, fieldname, enforce_lossless_conversion) \ - template <> \ - CAFFE2_EXPORT T ArgumentHelper::GetSingleArgument( \ - const string& name, const T& default_value) const { \ - if (arg_map_.count(name) == 0) { \ - VLOG(1) << "Using default parameter value " << default_value \ - << " for parameter " << name; \ - return default_value; \ - } \ - CAFFE_ENFORCE( \ - arg_map_.at(name).has_##fieldname(), \ - "Argument ", \ - name, \ - " does not have the right field: expected field " #fieldname); \ - auto value = arg_map_.at(name).fieldname(); \ - if (enforce_lossless_conversion) { \ - auto supportsConversion = \ - SupportsLosslessConversion(value); \ - CAFFE_ENFORCE( \ - supportsConversion, \ - "Value", \ - value, \ - " of argument ", \ - name, \ - "cannot be represented correctly in a target type"); \ - } \ - return static_cast(value); \ - } \ - template <> \ - CAFFE2_EXPORT bool ArgumentHelper::HasSingleArgumentOfType(const string& name) const { \ - if (arg_map_.count(name) == 0) { \ - return false; \ - } \ - return arg_map_.at(name).has_##fieldname(); \ +#define INSTANTIATE_GET_SINGLE_ARGUMENT( \ + T, fieldname, enforce_lossless_conversion) \ + template <> \ + C10_EXPORT T ArgumentHelper::GetSingleArgument( \ + const string& name, const T& default_value) const { \ + if (arg_map_.count(name) == 0) { \ + VLOG(1) << "Using default parameter value " << default_value \ + << " for parameter " << name; \ + return default_value; \ + } \ + CAFFE_ENFORCE( \ + arg_map_.at(name).has_##fieldname(), \ + "Argument ", \ + name, \ + " does not have the right field: expected field " #fieldname); \ + auto value = arg_map_.at(name).fieldname(); \ + if (enforce_lossless_conversion) { \ + auto supportsConversion = \ + SupportsLosslessConversion(value); \ + CAFFE_ENFORCE( \ + supportsConversion, \ + "Value", \ + value, \ + " of argument ", \ + name, \ + "cannot be represented correctly in a target type"); \ + } \ + return static_cast(value); \ + } \ + template <> \ + C10_EXPORT bool ArgumentHelper::HasSingleArgumentOfType( \ + const string& name) const { \ + if (arg_map_.count(name) == 0) { \ + return false; \ + } \ + return arg_map_.at(name).has_##fieldname(); \ } INSTANTIATE_GET_SINGLE_ARGUMENT(float, f, false) @@ -321,7 +331,7 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(NetDef, n, false) #define INSTANTIATE_GET_REPEATED_ARGUMENT( \ T, fieldname, enforce_lossless_conversion) \ template <> \ - CAFFE2_EXPORT vector 
ArgumentHelper::GetRepeatedArgument( \ + C10_EXPORT vector ArgumentHelper::GetRepeatedArgument( \ const string& name, const std::vector& default_value) const { \ if (arg_map_.count(name) == 0) { \ return default_value; \ @@ -358,14 +368,14 @@ INSTANTIATE_GET_REPEATED_ARGUMENT(string, strings, false) INSTANTIATE_GET_REPEATED_ARGUMENT(NetDef, nets, false) #undef INSTANTIATE_GET_REPEATED_ARGUMENT -#define CAFFE2_MAKE_SINGULAR_ARGUMENT(T, fieldname) \ -template <> \ -CAFFE2_EXPORT Argument MakeArgument(const string& name, const T& value) { \ - Argument arg; \ - arg.set_name(name); \ - arg.set_##fieldname(value); \ - return arg; \ -} +#define CAFFE2_MAKE_SINGULAR_ARGUMENT(T, fieldname) \ + template <> \ + C10_EXPORT Argument MakeArgument(const string& name, const T& value) { \ + Argument arg; \ + arg.set_name(name); \ + arg.set_##fieldname(value); \ + return arg; \ + } CAFFE2_MAKE_SINGULAR_ARGUMENT(bool, i) CAFFE2_MAKE_SINGULAR_ARGUMENT(float, f) @@ -375,28 +385,29 @@ CAFFE2_MAKE_SINGULAR_ARGUMENT(string, s) #undef CAFFE2_MAKE_SINGULAR_ARGUMENT template <> -CAFFE2_EXPORT bool ArgumentHelper::RemoveArgument(OperatorDef& def, int index); +C10_EXPORT bool ArgumentHelper::RemoveArgument(OperatorDef& def, int index); template <> bool ArgumentHelper::RemoveArgument(NetDef& def, int index); template <> -CAFFE2_EXPORT Argument MakeArgument(const string& name, const MessageLite& value) { +C10_EXPORT Argument MakeArgument(const string& name, const MessageLite& value) { Argument arg; arg.set_name(name); arg.set_s(value.SerializeAsString()); return arg; } -#define CAFFE2_MAKE_REPEATED_ARGUMENT(T, fieldname) \ -template <> \ -CAFFE2_EXPORT Argument MakeArgument(const string& name, const vector& value) {\ - Argument arg; \ - arg.set_name(name); \ - for (const auto& v : value) { \ - arg.add_##fieldname(v); \ - } \ - return arg; \ -} +#define CAFFE2_MAKE_REPEATED_ARGUMENT(T, fieldname) \ + template <> \ + C10_EXPORT Argument MakeArgument( \ + const string& name, const vector& value) { \ + Argument arg; \ + arg.set_name(name); \ + for (const auto& v : value) { \ + arg.add_##fieldname(v); \ + } \ + return arg; \ + } CAFFE2_MAKE_REPEATED_ARGUMENT(float, floats) CAFFE2_MAKE_REPEATED_ARGUMENT(int, ints) @@ -404,7 +415,7 @@ CAFFE2_MAKE_REPEATED_ARGUMENT(int64_t, ints) CAFFE2_MAKE_REPEATED_ARGUMENT(string, strings) #undef CAFFE2_MAKE_REPEATED_ARGUMENT -CAFFE2_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) { +C10_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) { for (const auto& outp : op.output()) { if (outp == output) { return true; @@ -413,7 +424,7 @@ CAFFE2_EXPORT bool HasOutput(const OperatorDef& op, const std::string& output) { return false; } -CAFFE2_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { +C10_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { for (const auto& inp : op.input()) { if (inp == input) { return true; @@ -423,7 +434,7 @@ CAFFE2_EXPORT bool HasInput(const OperatorDef& op, const std::string& input) { } // Return the argument index or -1 if it does not exist. 
-CAFFE2_EXPORT int GetArgumentIndex( +C10_EXPORT int GetArgumentIndex( const google::protobuf::RepeatedPtrField& args, const string& name) { int index = 0; @@ -436,7 +447,9 @@ CAFFE2_EXPORT int GetArgumentIndex( return -1; } -CAFFE2_EXPORT const Argument& GetArgument(const OperatorDef& def, const string& name) { +C10_EXPORT const Argument& GetArgument( + const OperatorDef& def, + const string& name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return def.arg(index); @@ -449,7 +462,7 @@ CAFFE2_EXPORT const Argument& GetArgument(const OperatorDef& def, const string& } } -CAFFE2_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) { +C10_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) { int index = GetArgumentIndex(def.arg(), name); if (index != -1) { return def.arg(index); @@ -462,7 +475,7 @@ CAFFE2_EXPORT const Argument& GetArgument(const NetDef& def, const string& name) } } -CAFFE2_EXPORT bool GetFlagArgument( +C10_EXPORT bool GetFlagArgument( const google::protobuf::RepeatedPtrField& args, const string& name, bool default_value) { @@ -476,21 +489,19 @@ CAFFE2_EXPORT bool GetFlagArgument( return default_value; } -CAFFE2_EXPORT bool GetFlagArgument( +C10_EXPORT bool GetFlagArgument( const OperatorDef& def, const string& name, bool default_value) { return GetFlagArgument(def.arg(), name, default_value); } -CAFFE2_EXPORT bool GetFlagArgument( - const NetDef& def, - const string& name, - bool default_value) { +C10_EXPORT bool +GetFlagArgument(const NetDef& def, const string& name, bool default_value) { return GetFlagArgument(def.arg(), name, default_value); } -CAFFE2_EXPORT Argument* GetMutableArgument( +C10_EXPORT Argument* GetMutableArgument( const string& name, const bool create_if_missing, OperatorDef* def) { diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h index dc7c365e86c9d..500ddf73434ab 100644 --- a/caffe2/utils/proto_utils.h +++ b/caffe2/utils/proto_utils.h @@ -194,7 +194,7 @@ CAFFE2_API bool HasInput(const OperatorDef& op, const std::string& input); * does not copy the operator def, so one would need to make sure that the * lifetime of the OperatorDef object outlives that of the ArgumentHelper. */ -class CAFFE2_EXPORT ArgumentHelper { +class C10_EXPORT ArgumentHelper { public: template static bool HasArgument(const Def& def, const string& name) { diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h index 27b75d8ccd3a6..b2fc9f03b0777 100644 --- a/caffe2/utils/threadpool/WorkersPool.h +++ b/caffe2/utils/threadpool/WorkersPool.h @@ -360,7 +360,7 @@ class WorkersPool { counter_to_decrement_when_ready_.Wait(); } - AT_DISABLE_COPY_AND_ASSIGN(WorkersPool); + C10_DISABLE_COPY_AND_ASSIGN(WorkersPool); std::vector>> workers_; // The BlockingCounter used to wait for the workers. 
BlockingCounter counter_to_decrement_when_ready_; diff --git a/caffe2/utils/zmq_helper.h b/caffe2/utils/zmq_helper.h index cfd1d53a98af6..bd45be9192dca 100644 --- a/caffe2/utils/zmq_helper.h +++ b/caffe2/utils/zmq_helper.h @@ -26,7 +26,7 @@ class ZmqContext { private: void* ptr_; - AT_DISABLE_COPY_AND_ASSIGN(ZmqContext); + C10_DISABLE_COPY_AND_ASSIGN(ZmqContext); }; class ZmqMessage { @@ -48,7 +48,7 @@ class ZmqMessage { private: zmq_msg_t msg_; - AT_DISABLE_COPY_AND_ASSIGN(ZmqMessage); + C10_DISABLE_COPY_AND_ASSIGN(ZmqMessage); }; class ZmqSocket { diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 869a563d05a27..45e9c99c3265c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -39,23 +39,6 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() endif() -# ---[ git: used to generate git build string. -find_package(Git) -if(GIT_FOUND) - execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --always --dirty - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE - WORKING_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/.." - OUTPUT_VARIABLE CAFFE2_GIT_VERSION - RESULT_VARIABLE __git_result) - if(NOT ${__git_result} EQUAL 0) - set(CAFFE2_GIT_VERSION "unknown") - endif() -else() - message( - WARNING - "Cannot find git, so Caffe2 won't have any git build info available") -endif() - # ---[ BLAS if(NOT BUILD_ATEN_MOBILE) set(BLAS "MKL" CACHE STRING "Selected BLAS library") @@ -419,13 +402,15 @@ find_package(pybind11 CONFIG) if((DEFINED pybind11_DIR) AND pybind11_DIR) get_target_property(pybind11_INCLUDE_DIRS pybind11::pybind11 INTERFACE_INCLUDE_DIRECTORIES) else() - message("pybind11 config not found. Fallback to legacy find.") find_package(pybind11) endif() if(pybind11_FOUND) + message(STATUS "System pybind11 found") + message(STATUS "pybind11l include dirs: " ${pybind11_INCLUDE_DIRS}) include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) else() + message(STATUS "Using third_party/pybind11.") include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/pybind11/include) endif() diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index b296e5f2e47ae..441a8e20cf068 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -314,15 +314,15 @@ if (USE_MKL AND USE_IDEEP) set(IDEEP_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep") set(MKLDNN_ROOT "${IDEEP_ROOT}/mkl-dnn") find_path(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include) - find_path(MKLDNN_INCLUDE_DIR mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) - if (NOT MKLDNN_INCLUDE_DIR) + find_path(MKLDNN_INCLUDE_DIR_HACK mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + if (NOT MKLDNN_INCLUDE_DIR_HACK) execute_process(COMMAND git submodule update --init mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT}) - find_path(MKLDNN_INCLUDE_DIR mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + find_path(MKLDNN_INCLUDE_DIR_HACK mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) endif() - if (MKLDNN_INCLUDE_DIR) - list(APPEND IDEEP_INCLUDE_DIR ${MKLDNN_INCLUDE_DIR}) - list(APPEND __ideep_looked_for MKLDNN_INCLUDE_DIR) + if (MKLDNN_INCLUDE_DIR_HACK) + list(APPEND IDEEP_INCLUDE_DIR ${MKLDNN_INCLUDE_DIR_HACK}) + list(APPEND __ideep_looked_for MKLDNN_INCLUDE_DIR_HACK) # to avoid adding conflicting submodels set(ORIG_WITH_TEST ${WITH_TEST}) set(WITH_TEST OFF) @@ -379,7 +379,7 @@ if (USE_MKL AND USE_IDEEP) endif() caffe_clear_vars(__ideep_looked_for __mklml_inner_libs) - endif() # MKLDNN_INCLUDE_DIR + endif() # MKLDNN_INCLUDE_DIR_HACK 
endif() # USE_IDEEP # Do nothing if MKL_FOUND was set before! diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 16d18ac7634d0..58eae123dd137 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -5,7 +5,6 @@ function (caffe2_print_configuration_summary) message(STATUS "General:") message(STATUS " CMake version : ${CMAKE_VERSION}") message(STATUS " CMake command : ${CMAKE_COMMAND}") - message(STATUS " Git version : ${CAFFE2_GIT_VERSION}") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") @@ -18,6 +17,8 @@ function (caffe2_print_configuration_summary) message(STATUS " CMAKE_INSTALL_PREFIX : ${CMAKE_INSTALL_PREFIX}") message(STATUS "") + message(STATUS " TORCH_VERSION : ${TORCH_VERSION}") + message(STATUS " CAFFE2_VERSION : ${CAFFE2_VERSION}") message(STATUS " BUILD_ATEN_MOBILE : ${BUILD_ATEN_MOBILE}") message(STATUS " BUILD_BINARY : ${BUILD_BINARY}") message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}") diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 066a7e63f9c57..2b847815603a9 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -24,9 +24,13 @@ endif() # Include directories. if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include") - set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") + set(TORCH_INCLUDE_DIRS + ${TORCH_INSTALL_PREFIX}/lib/include + ${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include) else() - set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/include") + set(TORCH_INCLUDE_DIRS + ${TORCH_INSTALL_PREFIX}/include + ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include) endif() # Library dependencies. @@ -45,7 +49,7 @@ if (@USE_CUDA@) set(TORCH_CUDA_LIBRARIES ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib ${CUDA_LIBRARIES}) - list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include") + list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include) elseif(APPLE) set(TORCH_CUDA_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib @@ -66,8 +70,8 @@ endif() set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") set_target_properties(torch PROPERTIES - IMPORTED_LOCATION ${TORCH_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${TORCH_INCLUDE_DIRS} - INTERFACE_COMPILE_OPTIONS ${TORCH_CXX_FLAGS} + IMPORTED_LOCATION "${TORCH_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${TORCH_INCLUDE_DIRS}" + INTERFACE_COMPILE_OPTIONS "${TORCH_CXX_FLAGS}" CXX_STANDARD 11 ) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index c212805a7b0dc..5505ae1f5c71b 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -113,6 +113,21 @@ function(caffe_parse_header_single_define LIBNAME HDR_PATH VARNAME) endif() endfunction() +################################################################################################ +# Parses a version string that might have values beyond major, minor, and patch +# and set version variables for the library. 
+# Usage: +# caffe2_parse_version_str( ) +function(caffe2_parse_version_str LIBNAME VERSIONSTR) + string(REGEX REPLACE "^([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${VERSIONSTR}") + string(REGEX REPLACE "^[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${VERSIONSTR}") + string(REGEX REPLACE "[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${VERSIONSTR}") + set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) +endfunction() + ############################################################################## # Helper function to automatically generate __init__.py files where python # sources reside but there are no __init__.py present. diff --git a/modules/observers/macros.h b/modules/observers/macros.h new file mode 100644 index 0000000000000..e69b055d2a1d5 --- /dev/null +++ b/modules/observers/macros.h @@ -0,0 +1,7 @@ +#include "c10/macros/Macros.h" + +#ifdef CAFFE2_BUILD_OBSERVER_LIB +#define CAFFE2_OBSERVER_API C10_EXPORT +#else +#define CAFFE2_OBSERVER_API C10_IMPORT +#endif diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h index 3650e4584f992..5619b69a636e7 100644 --- a/modules/observers/net_observer_reporter.h +++ b/modules/observers/net_observer_reporter.h @@ -4,6 +4,7 @@ #include "caffe2/core/common.h" #include "caffe2/core/net.h" +#include "observers/macros.h" namespace caffe2 { diff --git a/modules/observers/net_observer_reporter_print.h b/modules/observers/net_observer_reporter_print.h index eb712b8e71ea2..098a7f7573399 100644 --- a/modules/observers/net_observer_reporter_print.h +++ b/modules/observers/net_observer_reporter_print.h @@ -1,5 +1,6 @@ #pragma once +#include "observers/macros.h" #include "observers/net_observer_reporter.h" #include "caffe2/core/common.h" diff --git a/modules/observers/observer_config.h b/modules/observers/observer_config.h index e1a6b3a0ead8b..cc967263a66b9 100644 --- a/modules/observers/observer_config.h +++ b/modules/observers/observer_config.h @@ -1,5 +1,6 @@ #pragma once +#include "observers/macros.h" #include "observers/net_observer_reporter.h" #include "caffe2/core/common.h" diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h index 6fb4063ffe480..11fb870a61961 100644 --- a/modules/observers/perf_observer.h +++ b/modules/observers/perf_observer.h @@ -4,6 +4,7 @@ #include "caffe2/core/net.h" #include "caffe2/core/observer.h" #include "caffe2/core/timer.h" +#include "observers/macros.h" #include diff --git a/modules/rocksdb/rocksdb.cc b/modules/rocksdb/rocksdb.cc index b4752b67ca569..4f8918df41389 100644 --- a/modules/rocksdb/rocksdb.cc +++ b/modules/rocksdb/rocksdb.cc @@ -67,7 +67,7 @@ class RocksDBTransaction : public Transaction { rocksdb::DB* db_; std::unique_ptr batch_; - AT_DISABLE_COPY_AND_ASSIGN(RocksDBTransaction); + C10_DISABLE_COPY_AND_ASSIGN(RocksDBTransaction); }; class RocksDB : public DB { diff --git a/setup.py b/setup.py index 381123b2b9ced..94455ed1cf7be 100644 --- a/setup.py +++ b/setup.py @@ -346,6 +346,7 @@ def build_libs(libs): build_libs_cmd = ['bash', os.path.join('..', 'tools', 'build_pytorch_libs.sh')] my_env = os.environ.copy() my_env["PYTORCH_PYTHON"] = sys.executable + my_env["PYTORCH_BUILD_VERSION"] = version 
my_env["CMAKE_PREFIX_PATH"] = full_site_packages my_env["NUM_JOBS"] = str(NUM_JOBS) my_env["ONNX_NAMESPACE"] = ONNX_NAMESPACE @@ -471,18 +472,9 @@ def check_file(f): if not same: shutil.copyfile(orig_file, sym_file) - # Copy headers necessary to compile C++ extensions. - # - # This is not perfect solution as build does not depend on any of - # the auto-generated code and auto-generated files will not be - # included in this copy. If we want to use auto-generated files, - # we need to find a better way to do this. - # More information can be found in conversation thread of PR #5772 - self.copy_tree('torch/lib/tmp_install/share', 'torch/share') self.copy_tree('third_party/pybind11/include/pybind11/', 'torch/lib/include/pybind11') - self.copy_file('torch/csrc/torch.h', 'torch/lib/include/torch/torch.h') build_dep_cmds = {} @@ -1208,9 +1200,17 @@ def make_relative_rpath(path): 'lib/include/ATen/cudnn/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/caffe2/utils/*.h', + 'lib/include/c10/*.h', + 'lib/include/c10/macros/*.h', 'lib/include/torch/*.h', 'lib/include/torch/csrc/*.h', - 'lib/include/torch/csrc/api/include/torch/detail/ordered_dict.h', + 'lib/include/torch/csrc/api/include/torch/*.h', + 'lib/include/torch/csrc/api/include/torch/detail/*.h', + 'lib/include/torch/csrc/api/include/torch/nn/*.h', + 'lib/include/torch/csrc/api/include/torch/nn/modules/*.h', + 'lib/include/torch/csrc/api/include/torch/nn/parallel/*.h', + 'lib/include/torch/csrc/api/include/torch/optim/*.h', + 'lib/include/torch/csrc/api/include/torch/serialize/*.h', 'lib/include/torch/csrc/autograd/*.h', 'lib/include/torch/csrc/autograd/generated/*.h', 'lib/include/torch/csrc/cuda/*.h', diff --git a/test/cpp/api/any.cpp b/test/cpp/api/any.cpp index 0d8e98c4157ab..22eda0d1004d2 100644 --- a/test/cpp/api/any.cpp +++ b/test/cpp/api/any.cpp @@ -71,7 +71,7 @@ TEST_F( ASSERT_TRUE( any.forward(std::string("a"), std::string("ab"), std::string("abc")) .sum() - .toCInt() == 6); + .item() == 6); } TEST_F(AnyModuleTest, WrongArgumentType) { @@ -232,10 +232,10 @@ TEST_F(AnyModuleTest, ConvertsVariableToTensorCorrectly) { // mismatch). AnyModule any(M{}); ASSERT_TRUE( - any.forward(torch::autograd::Variable(torch::ones(5))).sum().toCFloat() == + any.forward(torch::autograd::Variable(torch::ones(5))).sum().item() == 5); // at::Tensors that are not variables work too. - ASSERT_EQ(any.forward(at::ones(5)).sum().toCFloat(), 5); + ASSERT_EQ(any.forward(at::ones(5)).sum().item(), 5); } namespace torch { diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 131b0440a41a1..b2d10097b2393 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -63,10 +63,10 @@ class CartPole { } void step(int action) { - auto x = state[0].toCFloat(); - auto x_dot = state[1].toCFloat(); - auto theta = state[2].toCFloat(); - auto theta_dot = state[3].toCFloat(); + auto x = state[0].item(); + auto x_dot = state[1].item(); + auto theta = state[2].item(); + auto theta_dot = state[3].item(); auto force = (action == 1) ? 
force_mag : -force_mag; auto costheta = std::cos(theta); @@ -222,7 +222,7 @@ bool test_mnist( torch::NoGradGuard guard; auto result = std::get<1>(forward_op(tedata).max(1)); torch::Tensor correct = (result == telabel).toType(torch::kFloat32); - return correct.sum().toCFloat() > telabel.size(0) * 0.8; + return correct.sum().item() > telabel.size(0) * 0.8; } struct IntegrationTest : torch::test::SeedingFixture {}; @@ -251,7 +251,7 @@ TEST_F(IntegrationTest, CartPole) { auto out = forward(state); auto probs = torch::Tensor(std::get<0>(out)); auto value = torch::Tensor(std::get<1>(out)); - auto action = probs.multinomial(1)[0].toCInt(); + auto action = probs.multinomial(1)[0].item(); // Compute the log prob of a multinomial distribution. // This should probably be actually implemented in autogradpp... auto p = probs / probs.sum(-1, true); @@ -274,7 +274,7 @@ TEST_F(IntegrationTest, CartPole) { std::vector policy_loss; std::vector value_loss; for (auto i = 0U; i < saved_log_probs.size(); i++) { - auto r = rewards[i] - saved_values[i].toCFloat(); + auto r = rewards[i] - saved_values[i].item(); policy_loss.push_back(-r * saved_log_probs[i]); value_loss.push_back( torch::smooth_l1_loss(saved_values[i], torch::ones(1) * rewards[i])); diff --git a/test/cpp/api/jit.cpp b/test/cpp/api/jit.cpp index 34b3e8f630c2a..9aa6968df71f5 100644 --- a/test/cpp/api/jit.cpp +++ b/test/cpp/api/jit.cpp @@ -20,10 +20,10 @@ TEST(TorchScriptTest, CanCompileMultipleFunctions) { auto a = torch::ones(1); auto b = torch::ones(1); - ASSERT_EQ(1, module->run_method("test_mul", a, b).toTensor().toCLong()); + ASSERT_EQ(1, module->run_method("test_mul", a, b).toTensor().item()); - ASSERT_EQ(2, module->run_method("test_relu", a, b).toTensor().toCLong()); + ASSERT_EQ(2, module->run_method("test_relu", a, b).toTensor().item()); ASSERT_TRUE( - 0x200 == module->run_method("test_while", a, b).toTensor().toCLong()); + 0x200 == module->run_method("test_while", a, b).toTensor().item()); } diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index ca716d0ac0c95..b85cb9dcc1a86 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -49,5 +49,5 @@ TEST(NNInitTest, CanInitializeTensorThatRequiresGrad) { tensor.fill_(1), "a leaf Variable that requires grad " "has been used in an in-place operation"); - ASSERT_EQ(torch::nn::init::ones_(tensor).sum().toCInt(), 12); + ASSERT_EQ(torch::nn::init::ones_(tensor).sum().item(), 12); } diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index f2bca9501ae64..70d05d4240e77 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -41,13 +41,13 @@ TEST_F(ModuleTest, ZeroGrad) { for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); ASSERT_TRUE(grad.defined()); - ASSERT_NE(grad.sum().toCFloat(), 0); + ASSERT_NE(grad.sum().item(), 0); } module->zero_grad(); for (auto& parameter : module->parameters()) { auto grad = parameter->grad(); ASSERT_TRUE(grad.defined()); - ASSERT_EQ(grad.sum().toCFloat(), 0); + ASSERT_EQ(grad.sum().item(), 0); } } @@ -72,7 +72,7 @@ TEST_F(ModuleTest, ZeroGradWithUndefined) { ASSERT_TRUE(module.x.grad().defined()); ASSERT_FALSE(module.y.grad().defined()); - ASSERT_EQ(module.x.grad().sum().toCFloat(), 0); + ASSERT_EQ(module.x.grad().sum().item(), 0); } TEST_F(ModuleTest, CanGetName) { diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 11e54a97a1885..fd9416eb3b9b6 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -134,7 +134,7 @@ TEST_F(ModulesTest, SimpleContainer) { 
ASSERT_EQ(x.ndimension(), 2); ASSERT_EQ(x.size(0), 1000); ASSERT_EQ(x.size(1), 100); - ASSERT_EQ(x.min().toCFloat(), 0); + ASSERT_EQ(x.min().item(), 0); } TEST_F(ModulesTest, EmbeddingBasic) { @@ -181,12 +181,12 @@ TEST_F(ModulesTest, Dropout) { y.backward(); ASSERT_EQ(y.ndimension(), 1); ASSERT_EQ(y.size(0), 100); - ASSERT_LT(y.sum().toCFloat(), 130); // Probably - ASSERT_GT(y.sum().toCFloat(), 70); // Probably + ASSERT_LT(y.sum().item(), 130); // Probably + ASSERT_GT(y.sum().item(), 70); // Probably dropout->eval(); y = dropout->forward(x); - ASSERT_EQ(y.sum().toCFloat(), 100); + ASSERT_EQ(y.sum().item(), 100); } TEST_F(ModulesTest, Parameters) { @@ -228,15 +228,15 @@ TEST_F(ModulesTest, FunctionalCallsSuppliedFunction) { TEST_F(ModulesTest, FunctionalWithTorchFunction) { auto functional = Functional(torch::relu); - ASSERT_EQ(functional(torch::ones({})).toCFloat(), 1); - ASSERT_EQ(functional(torch::ones({})).toCFloat(), 1); - ASSERT_EQ(functional(torch::ones({}) * -1).toCFloat(), 0); + ASSERT_EQ(functional(torch::ones({})).item(), 1); + ASSERT_EQ(functional(torch::ones({})).item(), 1); + ASSERT_EQ(functional(torch::ones({}) * -1).item(), 0); } TEST_F(ModulesTest, FunctionalArgumentBinding) { auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); - ASSERT_EQ(functional(torch::ones({})).toCFloat(), 0); + ASSERT_EQ(functional(torch::ones({})).item(), 0); } TEST_F(ModulesTest, BatchNormStateful) { diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index 03f7ed92a9b35..944a31ca7e997 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -44,7 +44,7 @@ bool test_optimizer_xor(Options options) { auto labels = torch::empty({kBatchSize}); for (size_t i = 0; i < kBatchSize; i++) { inputs[i] = torch::randint(2, {2}, torch::kInt64); - labels[i] = inputs[i][0].toCLong() ^ inputs[i][1].toCLong(); + labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } inputs.set_requires_grad(true); optimizer.zero_grad(); @@ -54,7 +54,7 @@ bool test_optimizer_xor(Options options) { optimizer.step(); - running_loss = running_loss * 0.99 + loss.toCFloat() * 0.01; + running_loss = running_loss * 0.99 + loss.item() * 0.01; if (epoch > kMaximumNumberOfEpochs) { std::cout << "Loss is too high after epoch " << epoch << ": " << running_loss << std::endl; @@ -286,14 +286,14 @@ TEST(OptimTest, ZeroGrad) { for (const auto& parameter : model->parameters()) { ASSERT_TRUE(parameter->grad().defined()); - ASSERT_GT(parameter->grad().sum().toCFloat(), 0); + ASSERT_GT(parameter->grad().sum().item(), 0); } optimizer.zero_grad(); for (const auto& parameter : model->parameters()) { ASSERT_TRUE(parameter->grad().defined()); - ASSERT_EQ(parameter->grad().sum().toCFloat(), 0); + ASSERT_EQ(parameter->grad().sum().item(), 0); } } diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index 71bcc542f8439..a191078236447 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -38,7 +38,7 @@ TEST_F(ParallelTest, DifferentiableScatter_MultiCUDA) { ASSERT_TRUE(input.grad().defined()); ASSERT_TRUE(input.grad().device().is_cpu()); - ASSERT_EQ(input.grad().sum().toCInt(), 10); + ASSERT_EQ(input.grad().sum().item(), 10); } TEST_F(ParallelTest, DifferentiableGather_MultiCUDA) { @@ -62,11 +62,11 @@ TEST_F(ParallelTest, DifferentiableGather_MultiCUDA) { ASSERT_TRUE(a.grad().defined()); ASSERT_EQ(a.grad().device(), torch::Device(torch::kCUDA, 0)); - ASSERT_EQ(a.grad().sum().toCInt(), 5); + ASSERT_EQ(a.grad().sum().item(), 5); ASSERT_TRUE(b.grad().defined()); 
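Aside (illustration only, not part of the patch): the C++ test edits above and below replace the old typed accessors (toCFloat, toCInt, toCLong) with the item() accessor, the C++ counterpart of the scalar-extraction idiom Python exposes as Tensor.item(). A minimal Python sketch of that idiom:

    import torch

    s = torch.ones(5).sum()      # a zero-dimensional tensor
    assert s.item() == 5.0       # item() yields a plain Python number

    idx = torch.tensor([1, 2, 3])
    assert idx[0].item() == 1    # integer tensors give a Python int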
ASSERT_EQ(b.grad().device(), torch::Device(torch::kCUDA, 1)); - ASSERT_EQ(b.grad().sum().toCInt(), 5); + ASSERT_EQ(b.grad().sum().item(), 5); } TEST_F(ParallelTest, Replicate_MultiCUDA) { @@ -226,6 +226,6 @@ TEST_F(ParallelTest, DataParallelUsesAllAvailableCUDADevices_CUDA) { const auto device_count = torch::cuda::device_count(); ASSERT_EQ(output.numel(), device_count); for (size_t i = 0; i < device_count; ++i) { - ASSERT_EQ(output[i].toCInt(), i); + ASSERT_EQ(output[i].item(), i); } } diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp index 96ffd37eb0f62..e0d511fb09938 100644 --- a/test/cpp/api/rnn.cpp +++ b/test/cpp/api/rnn.cpp @@ -56,7 +56,7 @@ bool test_RNN_xor(Func&& model_maker, bool cuda = false) { loss.backward(); optimizer.step(); - running_loss = running_loss * 0.99 + loss.toCFloat() * 0.01; + running_loss = running_loss * 0.99 + loss.item() * 0.01; if (epoch > max_epoch) { return false; } @@ -81,7 +81,7 @@ void check_lstm_sizes(RNNOutput output) { ASSERT_EQ(output.state.size(3), 64); // 64 hidden dims // Something is in the hiddens - ASSERT_GT(output.state.norm().toCFloat(), 0); + ASSERT_GT(output.state.norm().item(), 0); } struct RNNTest : torch::test::SeedingFixture {}; @@ -103,7 +103,7 @@ TEST_F(RNNTest, CheckOutputSizes) { torch::Tensor diff = next.state - output.state; // Hiddens changed - ASSERT_GT(diff.abs().sum().toCFloat(), 1e-3); + ASSERT_GT(diff.abs().sum().item(), 1e-3); } TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { @@ -137,7 +137,7 @@ TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { 0.6620, 0.7860, 0.6501, 0.7741, 0.7889, 0.9003, 0.7769, 0.8905, 0.7635, 0.8794, 0.7484, 0.8666}; for (size_t i = 0; i < 3 * 4 * 2; i++) { - ASSERT_LT(std::abs(flat[i].toCFloat() - c_out[i]), 1e-3); + ASSERT_LT(std::abs(flat[i].item() - c_out[i]), 1e-3); } ASSERT_EQ(out.state.ndimension(), 4); // (hx, cx) x layers x B x 2 @@ -163,7 +163,7 @@ TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { 1.0931, 1.4911}; for (size_t i = 0; i < 16; i++) { - ASSERT_LT(std::abs(flat[i].toCFloat() - h_out[i]), 1e-3); + ASSERT_LT(std::abs(flat[i].item() - h_out[i]), 1e-3); } } @@ -206,7 +206,7 @@ TEST_F(RNNTest, Sizes_CUDA) { torch::Tensor diff = next.state - output.state; // Hiddens changed - ASSERT_GT(diff.abs().sum().toCFloat(), 1e-3); + ASSERT_GT(diff.abs().sum().item(), 1e-3); } TEST_F(RNNTest, EndToEndLSTM_CUDA) { diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp index a37c00c2e3eff..0612029f53bca 100644 --- a/test/cpp/api/serialize.cpp +++ b/test/cpp/api/serialize.cpp @@ -90,7 +90,7 @@ TEST(Serialize, XOR) { auto labels = torch::empty({batch_size}); for (size_t i = 0; i < batch_size; i++) { inputs[i] = torch::randint(2, {2}, torch::kInt64); - labels[i] = inputs[i][0].toCLong() ^ inputs[i][1].toCLong(); + labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } auto x = model->forward(inputs); return torch::binary_cross_entropy(x, labels); @@ -112,7 +112,7 @@ TEST(Serialize, XOR) { loss.backward(); optimizer.step(); - running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01; + running_loss = running_loss * 0.99 + loss.sum().item() * 0.01; ASSERT_LT(epoch, 3000); epoch++; } @@ -122,7 +122,7 @@ TEST(Serialize, XOR) { torch::load(model2, tempfile.str()); auto loss = getLoss(model2, 100); - ASSERT_LT(loss.toCFloat(), 0.1); + ASSERT_LT(loss.item(), 0.1); } TEST(Serialize, Optim) { @@ -188,9 +188,9 @@ TEST(Serialize, Optim) { const auto& name = p.key; // Model 1 and 3 should be the same ASSERT_TRUE( - param1[name].norm().toCFloat() == param3[name].norm().toCFloat()); + 
param1[name].norm().item() == param3[name].norm().item()); ASSERT_TRUE( - param1[name].norm().toCFloat() != param2[name].norm().toCFloat()); + param1[name].norm().item() != param2[name].norm().item()); } } @@ -202,7 +202,7 @@ TEST(Serialize, Optim) { // auto labels = torch::empty({batch_size}); // for (size_t i = 0; i < batch_size; i++) { // inputs[i] = torch::randint(2, {2}, torch::kInt64); -// labels[i] = inputs[i][0].toCLong() ^ inputs[i][1].toCLong(); +// labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); // } // auto x = model->forward(inputs); // return torch::binary_cross_entropy(x, labels); @@ -224,7 +224,7 @@ TEST(Serialize, Optim) { // loss.backward(); // optimizer.step(); // -// running_loss = running_loss * 0.99 + loss.sum().toCFloat() * 0.01; +// running_loss = running_loss * 0.99 + loss.sum().item() * 0.01; // ASSERT_LT(epoch, 3000); // epoch++; // } @@ -234,7 +234,7 @@ TEST(Serialize, Optim) { // torch::load(model2, tempfile.str()); // // auto loss = getLoss(model2, 100); -// ASSERT_LT(loss.toCFloat(), 0.1); +// ASSERT_LT(loss.item(), 0.1); // // model2->to(torch::kCUDA); // torch::test::TempFile tempfile2; @@ -242,5 +242,5 @@ TEST(Serialize, Optim) { // torch::load(model3, tempfile2.str()); // // loss = getLoss(model3, 100); -// ASSERT_LT(loss.toCFloat(), 0.1); +// ASSERT_LT(loss.item(), 0.1); // } diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp index ad14298d86c96..3996132cc8479 100644 --- a/test/cpp/api/tensor.cpp +++ b/test/cpp/api/tensor.cpp @@ -104,7 +104,7 @@ TEST(TensorTest, ContainsCorrectValueForSingleValue) { auto tensor = at::tensor(123); ASSERT_EQ(tensor.numel(), 1); ASSERT_EQ(tensor.dtype(), at::kInt); - ASSERT_EQ(tensor[0].toCInt(), 123); + ASSERT_EQ(tensor[0].item(), 123); tensor = at::tensor(123.456f); ASSERT_EQ(tensor.numel(), 1); @@ -189,7 +189,7 @@ TEST(TensorTest, FromBlob) { auto tensor = torch::from_blob(v.data(), v.size(), torch::kInt32); ASSERT_TRUE(tensor.is_variable()); ASSERT_EQ(tensor.numel(), 3); - ASSERT_EQ(tensor[0].toCInt(), 1); - ASSERT_EQ(tensor[1].toCInt(), 2); - ASSERT_EQ(tensor[2].toCInt(), 3); + ASSERT_EQ(tensor[0].item(), 1); + ASSERT_EQ(tensor[1].item(), 2); + ASSERT_EQ(tensor[2].item(), 3); } diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp index db75e3f67f777..21b05d060b190 100644 --- a/test/cpp_extensions/complex_registration_extension.cpp +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/test/cpp_extensions/cpp_api_extension.cpp b/test/cpp_extensions/cpp_api_extension.cpp new file mode 100644 index 0000000000000..066ad64160fa5 --- /dev/null +++ b/test/cpp_extensions/cpp_api_extension.cpp @@ -0,0 +1,38 @@ +#include +#include +#include + +struct Net : torch::nn::Module { + Net(int64_t in, int64_t out) + : fc(in, out), + bn(torch::nn::BatchNormOptions(out).stateful(true)), + dropout(0.5) { + register_module("fc", fc); + register_module("bn", bn); + register_module("dropout", dropout); + } + + torch::Tensor forward(torch::Tensor x) { + return dropout->forward(bn->forward(torch::relu(fc->forward(x)))); + } + + void set_bias(torch::Tensor bias) { + fc->bias = bias; + } + + torch::Tensor get_bias() const { + return fc->bias; + } + + torch::nn::Linear fc; + torch::nn::BatchNorm bn; + torch::nn::Dropout dropout; +}; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + torch::python::bind_module(m, "Net") + .def(py::init()) + .def("forward", &Net::forward) + .def("set_bias", 
&Net::set_bias) + .def("get_bias", &Net::get_bias); +} diff --git a/test/cpp_extensions/cuda_extension.cpp b/test/cpp_extensions/cuda_extension.cpp index 963850acc2795..9946b4f9cb97d 100644 --- a/test/cpp_extensions/cuda_extension.cpp +++ b/test/cpp_extensions/cuda_extension.cpp @@ -1,4 +1,4 @@ -#include +#include // Declare the function from cuda_extension.cu. It will be compiled // separately with nvcc and linked with the object file of cuda_extension.cpp diff --git a/test/cpp_extensions/cudnn_extension.cpp b/test/cpp_extensions/cudnn_extension.cpp index 7c3be3e471630..498e01a116a15 100644 --- a/test/cpp_extensions/cudnn_extension.cpp +++ b/test/cpp_extensions/cudnn_extension.cpp @@ -10,7 +10,7 @@ * 5) Return something (optional). */ -#include +#include #include // for TensorDescriptor #include // for CUDNN_CHECK diff --git a/test/cpp_extensions/doubler.h b/test/cpp_extensions/doubler.h index 2b22dca1284cd..d9e6aaea8c346 100644 --- a/test/cpp_extensions/doubler.h +++ b/test/cpp_extensions/doubler.h @@ -1,4 +1,4 @@ -#include +#include struct Doubler { Doubler(int A, int B) { diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp index 8e79397296910..3ba27d92f32d7 100644 --- a/test/cpp_extensions/extension.cpp +++ b/test/cpp_extensions/extension.cpp @@ -1,4 +1,4 @@ -#include +#include at::Tensor sigmoid_add(at::Tensor x, at::Tensor y) { return x.sigmoid() + y.sigmoid(); diff --git a/test/cpp_extensions/half_support.cu b/test/cpp_extensions/half_support.cu index a3621bfe7c55f..9d420438fb526 100644 --- a/test/cpp_extensions/half_support.cu +++ b/test/cpp_extensions/half_support.cu @@ -1,4 +1,4 @@ -#include +#include #include diff --git a/test/cpp_extensions/jit_extension.cpp b/test/cpp_extensions/jit_extension.cpp index e62be5b38ba1d..576e7fc9a1d3c 100644 --- a/test/cpp_extensions/jit_extension.cpp +++ b/test/cpp_extensions/jit_extension.cpp @@ -1,4 +1,4 @@ -#include +#include #include "doubler.h" diff --git a/test/cpp_extensions/jit_extension2.cpp b/test/cpp_extensions/jit_extension2.cpp index e197308c3d59e..cfd472137187a 100644 --- a/test/cpp_extensions/jit_extension2.cpp +++ b/test/cpp_extensions/jit_extension2.cpp @@ -1,4 +1,4 @@ -#include +#include using namespace at; diff --git a/test/expect/TestJit.test_cpp_cuda.expect b/test/expect/TestJit.test_cpp_cuda.expect index 451f1f9329601..8453308a0dfb5 100644 --- a/test/expect/TestJit.test_cpp_cuda.expect +++ b/test/expect/TestJit.test_cpp_cuda.expect @@ -65,6 +65,8 @@ graph(%0 : Dynamic %3 : Dynamic %4 : Dynamic) { %23 : Dynamic, %24 : Dynamic = prim::DifferentiableGraph_0(%0, %3, %1, %4, %2) + %7 : int = prim::Constant[value=1]() + %19 : int = prim::Constant[value=1]() return (%24, %23); } with prim::DifferentiableGraph_0 = graph(%1 : Dynamic @@ -74,20 +76,20 @@ with prim::DifferentiableGraph_0 = graph(%1 : Dynamic %17 : Dynamic) { %0 : Dynamic = aten::mm(%1, %2) %3 : Dynamic = aten::mm(%4, %5) - %6 : int = prim::Constant[value=1]() - %7 : Dynamic = aten::add(%0, %3, %6) - %8 : Dynamic, %9 : Dynamic, %10 : Dynamic, %11 : Dynamic = prim::ConstantChunk[chunks=4, dim=1](%7) + %7 : int = prim::Constant[value=1]() + %6 : Dynamic = aten::add(%0, %3, %7) + %8 : Dynamic, %9 : Dynamic, %10 : Dynamic, %11 : Dynamic = prim::ConstantChunk[chunks=4, dim=1](%6) %12 : Dynamic = aten::sigmoid(%8) %13 : Dynamic = aten::sigmoid(%11) %14 : Dynamic = aten::tanh(%10) %15 : Dynamic = aten::sigmoid(%9) %16 : Dynamic = aten::mul(%15, %17) %18 : Dynamic = aten::mul(%12, %14) - %19 : int = prim::Constant[value=1]() - %20 : Dynamic = 
aten::add(%16, %18, %19) - %21 : Dynamic = aten::tanh(%20) + %20 : int = prim::Constant[value=1]() + %19 : Dynamic = aten::add(%16, %18, %20) + %21 : Dynamic = aten::tanh(%19) %22 : Dynamic = aten::mul(%13, %21) - return (%20, %22); + return (%19, %22); } testDifferentiate diff --git a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect index efb3d272bb4c2..cbdbc744b5e85 100644 --- a/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_lstm_fusion_cuda-backward.expect @@ -17,20 +17,18 @@ graph(%0 : Float(*, *) %cellgate : Float(*, *) %outgate : Float(*, *) %18 : Float(*, *)) { - %19 : int = prim::Constant[value=1]() - %20 : Float(*, *), %21 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %11, %1, %18, %0) - %22 : Float(*, *) = aten::mul(%20, %19) - %23 : Float(*, *) = aten::t(%13) - %24 : Float(*, *) = aten::mm(%22, %23) - %25 : Float(*, *) = aten::t(%10) - %26 : Float(*, *) = aten::mm(%25, %22) - %27 : Float(*, *) = aten::t(%26) - %28 : Float(*, *) = aten::t(%12) - %29 : Float(*, *) = aten::mm(%20, %28) - %30 : Float(*, *) = aten::t(%9) - %31 : Float(*, *) = aten::mm(%30, %20) - %32 : Float(*, *) = aten::t(%31) - return (%32, %29, %27, %24, %22, %22, %21); + %19 : Float(*, *), %20 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %11, %1, %18, %0) + %21 : Float(*, *) = aten::t(%13) + %22 : Float(*, *) = aten::mm(%19, %21) + %23 : Float(*, *) = aten::t(%10) + %24 : Float(*, *) = aten::mm(%23, %19) + %25 : Float(*, *) = aten::t(%24) + %26 : Float(*, *) = aten::t(%12) + %27 : Float(*, *) = aten::mm(%19, %26) + %28 : Float(*, *) = aten::t(%9) + %29 : Float(*, *) = aten::mm(%28, %19) + %30 : Float(*, *) = aten::t(%29) + return (%30, %27, %25, %22, %19, %19, %20); } with prim::FusionGroup_0 = graph(%0 : Float(*, *) %1 : Float(*, *) @@ -52,31 +50,29 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %17 : Float(*, *) = aten::add(%7, %14, %16) %18 : Float(*, *) = aten::mul(%17, %1) %19 : Float(*, *) = aten::mul(%5, %6) - %20 : int = prim::Constant[value=1]() - %21 : Float(*, *) = aten::mul(%17, %20) - %22 : Float(*, *) = aten::mul(%21, %2) - %23 : Float(*, *) = aten::mul(%21, %0) - %24 : Float(*, *) = aten::mul(%17, %4) - %25 : Float(*, *) = aten::neg(%3) - %26 : int = prim::Constant[value=1]() - %27 : Float(*, *) = aten::add(%25, %26, %26) - %28 : Float(*, *) = aten::mul(%19, %3) - %29 : Float(*, *) = aten::mul(%28, %27) - %30 : Float(*, *) = aten::mul(%2, %2) - %31 : Float(*, *) = aten::neg(%30) - %32 : int = prim::Constant[value=1]() - %33 : Float(*, *) = aten::add(%31, %32, %32) - %34 : Float(*, *) = aten::mul(%23, %33) - %35 : Float(*, *) = aten::neg(%1) - %36 : int = prim::Constant[value=1]() - %37 : Float(*, *) = aten::add(%35, %36, %36) - %38 : Float(*, *) = aten::mul(%24, %1) - %39 : Float(*, *) = aten::mul(%38, %37) - %40 : Float(*, *) = aten::neg(%0) - %41 : int = prim::Constant[value=1]() - %42 : Float(*, *) = aten::add(%40, %41, %41) - %43 : Float(*, *) = aten::mul(%22, %0) - %44 : Float(*, *) = aten::mul(%43, %42) - %45 : Float(*, *) = prim::FusedConcat[dim=1](%44, %39, %34, %29) - return (%45, %18); + %20 : Float(*, *) = aten::mul(%17, %2) + %21 : Float(*, *) = aten::mul(%17, %0) + %22 : Float(*, *) = aten::mul(%17, %4) + %23 : Float(*, *) = aten::neg(%3) + %24 : int = prim::Constant[value=1]() + %25 : Float(*, *) = aten::add(%23, %24, %24) + %26 : Float(*, *) = aten::mul(%19, %3) + %27 : 
Float(*, *) = aten::mul(%26, %25) + %28 : Float(*, *) = aten::mul(%2, %2) + %29 : Float(*, *) = aten::neg(%28) + %30 : int = prim::Constant[value=1]() + %31 : Float(*, *) = aten::add(%29, %30, %30) + %32 : Float(*, *) = aten::mul(%21, %31) + %33 : Float(*, *) = aten::neg(%1) + %34 : int = prim::Constant[value=1]() + %35 : Float(*, *) = aten::add(%33, %34, %34) + %36 : Float(*, *) = aten::mul(%22, %1) + %37 : Float(*, *) = aten::mul(%36, %35) + %38 : Float(*, *) = aten::neg(%0) + %39 : int = prim::Constant[value=1]() + %40 : Float(*, *) = aten::add(%38, %39, %39) + %41 : Float(*, *) = aten::mul(%20, %0) + %42 : Float(*, *) = aten::mul(%41, %40) + %43 : Float(*, *) = prim::FusedConcat[dim=1](%42, %37, %32, %27) + return (%43, %18); } diff --git a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect index 1221d05e51925..b0dc85644751d 100644 --- a/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect +++ b/test/expect/TestScript.test_milstm_fusion_cuda-backward.expect @@ -27,14 +27,17 @@ graph(%0 : Float(*, *) %outgate : Float(*, *) %27 : Float(*, *)) { %28 : Float(*, *) = prim::FusionGroup_0[device=0](%ingate, %forgetgate, %cellgate, %outgate, %17, %0, %27, %1) - %29 : Float(*, *), %30 : Float(*, *), %31 : Float(*, *), %32 : Float(*, *), %33 : Float(*, *), %34 : Float(*, *) = prim::FusionGroup_1[device=0](%14, %15, %Wx, %28, %Uz, %22, %16) - %35 : Float(*, *) = aten::t(%13) - %36 : Float(*, *) = aten::mm(%35, %31) - %37 : Float(*, *) = aten::t(%36) - %38 : Float(*, *) = aten::t(%12) - %39 : Float(*, *) = aten::mm(%38, %29) - %40 : Float(*, *) = aten::t(%39) - return (%40, %37, %30, %32, %33, %34); + %29 : Float(*, *) = aten::mul(%28, %Uz) + %30 : Float(*, *) = aten::mul(%28, %Wx) + %31 : Float(*, *) = prim::FusionGroup_1[device=0](%28, %22, %16) + %32 : Float(*, *), %33 : Float(*, *) = prim::FusionGroup_2[device=0](%14, %28, %15, %Wx, %Uz) + %34 : Float(*, *) = aten::t(%13) + %35 : Float(*, *) = aten::mm(%34, %31) + %36 : Float(*, *) = aten::t(%35) + %37 : Float(*, *) = aten::t(%12) + %38 : Float(*, *) = aten::mm(%37, %32) + %39 : Float(*, *) = aten::t(%38) + return (%39, %36, %33, %30, %29, %28); } with prim::FusionGroup_0 = graph(%0 : Float(*, *) %1 : Float(*, *) @@ -53,58 +56,51 @@ with prim::FusionGroup_0 = graph(%0 : Float(*, *) %14 : Float(*, *) = aten::mul(%9, %13) %15 : int = prim::Constant[value=1]() %16 : Float(*, *) = aten::add(%5, %14, %15) - %17 : int = prim::Constant[value=1]() - %18 : Float(*, *) = aten::mul(%16, %17) - %19 : Float(*, *) = aten::mul(%18, %2) - %20 : Float(*, *) = aten::mul(%18, %0) - %21 : Float(*, *) = aten::mul(%16, %4) - %22 : Float(*, *) = aten::neg(%3) - %23 : int = prim::Constant[value=1]() - %24 : Float(*, *) = aten::add(%22, %23, %23) - %25 : Float(*, *) = aten::mul(%8, %3) - %26 : Float(*, *) = aten::mul(%25, %24) - %27 : Float(*, *) = aten::mul(%2, %2) - %28 : Float(*, *) = aten::neg(%27) - %29 : int = prim::Constant[value=1]() - %30 : Float(*, *) = aten::add(%28, %29, %29) - %31 : Float(*, *) = aten::mul(%20, %30) - %32 : Float(*, *) = aten::neg(%1) - %33 : int = prim::Constant[value=1]() - %34 : Float(*, *) = aten::add(%32, %33, %33) - %35 : Float(*, *) = aten::mul(%21, %1) - %36 : Float(*, *) = aten::mul(%35, %34) - %37 : Float(*, *) = aten::neg(%0) - %38 : int = prim::Constant[value=1]() - %39 : Float(*, *) = aten::add(%37, %38, %38) - %40 : Float(*, *) = aten::mul(%19, %0) - %41 : Float(*, *) = aten::mul(%40, %39) - %42 : Float(*, *) = 
prim::FusedConcat[dim=1](%41, %36, %31, %26) - return (%42); + %17 : Float(*, *) = aten::mul(%16, %2) + %18 : Float(*, *) = aten::mul(%16, %0) + %19 : Float(*, *) = aten::mul(%16, %4) + %20 : Float(*, *) = aten::neg(%3) + %21 : int = prim::Constant[value=1]() + %22 : Float(*, *) = aten::add(%20, %21, %21) + %23 : Float(*, *) = aten::mul(%8, %3) + %24 : Float(*, *) = aten::mul(%23, %22) + %25 : Float(*, *) = aten::mul(%2, %2) + %26 : Float(*, *) = aten::neg(%25) + %27 : int = prim::Constant[value=1]() + %28 : Float(*, *) = aten::add(%26, %27, %27) + %29 : Float(*, *) = aten::mul(%18, %28) + %30 : Float(*, *) = aten::neg(%1) + %31 : int = prim::Constant[value=1]() + %32 : Float(*, *) = aten::add(%30, %31, %31) + %33 : Float(*, *) = aten::mul(%19, %1) + %34 : Float(*, *) = aten::mul(%33, %32) + %35 : Float(*, *) = aten::neg(%0) + %36 : int = prim::Constant[value=1]() + %37 : Float(*, *) = aten::add(%35, %36, %36) + %38 : Float(*, *) = aten::mul(%17, %0) + %39 : Float(*, *) = aten::mul(%38, %37) + %40 : Float(*, *) = prim::FusedConcat[dim=1](%39, %34, %29, %24) + return (%40); } -with prim::FusionGroup_1 = graph(%0 : Float(*) - %1 : Float(*) - %2 : Float(*, *) +with prim::FusionGroup_1 = graph(%0 : Float(*, *) + %1 : Float(*, *) + %2 : Float(*)) { + %3 : Float(*, *) = aten::mul(%0, %2) + %4 : Float(*, *) = aten::mul(%0, %1) + %5 : int = prim::Constant[value=1]() + %6 : Float(*, *) = aten::add(%3, %4, %5) + return (%6); +} +with prim::FusionGroup_2 = graph(%0 : Float(*) + %1 : Float(*, *) + %2 : Float(*) %3 : Float(*, *) - %4 : Float(*, *) - %5 : Float(*, *) - %6 : Float(*)) { - %7 : int = prim::Constant[value=1]() - %8 : int = prim::Constant[value=1]() + %4 : Float(*, *)) { + %5 : Float(*, *) = aten::mul(%1, %4) + %6 : Float(*, *) = aten::mul(%5, %3) + %7 : Float(*, *) = aten::mul(%1, %2) + %8 : Float(*, *) = aten::mul(%5, %0) %9 : int = prim::Constant[value=1]() - %10 : int = prim::Constant[value=1]() - %11 : Float(*, *) = aten::mul(%3, %10) - %12 : Float(*, *) = aten::mul(%11, %4) - %13 : Float(*, *) = aten::mul(%11, %2) - %14 : Float(*, *) = aten::mul(%11, %6) - %15 : Float(*, *) = aten::mul(%3, %5) - %16 : int = prim::Constant[value=1]() - %17 : int = prim::Constant[value=1]() - %18 : Float(*, *) = aten::add(%14, %15, %17) - %19 : Float(*, *) = aten::mul(%3, %4) - %20 : Float(*, *) = aten::mul(%19, %2) - %21 : Float(*, *) = aten::mul(%11, %1) - %22 : Float(*, *) = aten::mul(%19, %0) - %23 : int = prim::Constant[value=1]() - %24 : Float(*, *) = aten::add(%21, %22, %23) - return (%24, %20, %18, %13, %12, %11); + %10 : Float(*, *) = aten::add(%7, %8, %9) + return (%10, %6); } diff --git a/test/expect/TestScript.test_scalar_fusion.expect b/test/expect/TestScript.test_scalar_fusion.expect index 9d45a9f765d63..e2fd92a0f5739 100644 --- a/test/expect/TestScript.test_scalar_fusion.expect +++ b/test/expect/TestScript.test_scalar_fusion.expect @@ -1,12 +1,13 @@ graph(%x : Float() %y : Float()) { - %2 : Float() = prim::FusionGroup_0[device=-1](%x, %y) + %2 : Float() = prim::FusionGroup_0[device=-1](%y, %x) return (%2); } with prim::FusionGroup_0 = graph(%0 : Float() %1 : Float()) { - %2 : Float() = aten::type_as(%1, %0) - %3 : int = prim::Constant[value=1]() - %4 : Float() = aten::add(%0, %2, %3) - return (%4); + %2 : int = prim::Constant[value=2]() + %3 : Float() = aten::mul(%2, %1) + %4 : int = prim::Constant[value=1]() + %5 : Float() = aten::add(%3, %0, %4) + return (%5); } diff --git a/test/onnx/model_defs/squeezenet.py b/test/onnx/model_defs/squeezenet.py index e4ace18194ab7..2ee956b605cd1 100644 
--- a/test/onnx/model_defs/squeezenet.py +++ b/test/onnx/model_defs/squeezenet.py @@ -79,9 +79,9 @@ def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): for m in self.modules(): if isinstance(m, nn.Conv2d): if m is final_conv: - init.normal(m.weight.data, mean=0.0, std=0.01) + init.normal_(m.weight.data, mean=0.0, std=0.01) else: - init.kaiming_uniform(m.weight.data) + init.kaiming_uniform_(m.weight.data) if m.bias is not None: m.bias.data.zero_() diff --git a/test/onnx/model_defs/super_resolution.py b/test/onnx/model_defs/super_resolution.py index d0ba46a22d05a..619d5f4a5b581 100644 --- a/test/onnx/model_defs/super_resolution.py +++ b/test/onnx/model_defs/super_resolution.py @@ -24,7 +24,7 @@ def forward(self, x): return x def _initialize_weights(self): - init.orthogonal(self.conv1.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv2.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv3.weight, init.calculate_gain('relu')) - init.orthogonal(self.conv4.weight) + init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv4.weight) diff --git a/test/test_autograd.py b/test/test_autograd.py index e755351c336b6..965fdab9c8b54 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -299,6 +299,33 @@ def hook(*grads): self.assertFalse(hook_called[0]) self.assertIsNone(x.grad) + def test_grad_nonleaf_register_hook(self): + # This checks an edge case for register_hook. + # We want to capture grad of a nonleaf tensor, + # but avoid segfault during backward of other nonleaf tensors + x = torch.randn(5, requires_grad=True) + x_list = x.unbind() + + x0 = x_list[0] + hook_results = [None] + + def hook(grad): + hook_results[0] = grad + x0.register_hook(hook) + + x_list[0].backward() + self.assertEqual(hook_results[0], torch.tensor(1.)) + expected_grad = torch.tensor([1., 0, 0, 0, 0]) + self.assertEqual(x.grad, expected_grad) + self.assertIsNone(x_list[0].grad) + + for i in range(1, 5, 1): + x_list[i].backward() + self.assertEqual(hook_results[0], None) + expected_grad[i] = 1.0 + self.assertEqual(x.grad, expected_grad) + self.assertIsNone(x_list[i].grad) + def test_sharded_grad(self): leaves = [torch.zeros(5, 5, requires_grad=True) for _ in range(10)] intermediates = [l * i + l * l for i, l in enumerate(leaves)] diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index 3702205e4c449..e5b1121784f07 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -23,6 +23,9 @@ TEST_CUDNN = TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available() +IS_WINDOWS = sys.platform == 'win32' + + class TestCppExtension(common.TestCase): def setUp(self): if sys.platform != 'win32': @@ -189,7 +192,7 @@ def test_inline_jit_compile_extension_multiple_sources_and_no_functions(self): ''' cpp_source2 = ''' - #include + #include at::Tensor sin_add(at::Tensor x, at::Tensor y); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("sin_add", &sin_add, "sin(x) + sin(y)"); @@ -265,7 +268,7 @@ def test_lenient_flag_handling_in_jit_extensions(self): cpp_sources=cpp_source, functions='tanh_add', extra_cflags=['-g\n\n', '-O0 -Wall'], - extra_include_paths=[' cpp_extensions\n', '../'], + extra_include_paths=[' cpp_extensions\n'], verbose=True) x = torch.zeros(100, dtype=torch.float32) @@ -341,6 +344,50 @@ def compile(code): module = compile('int f() { return 789; }') 
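Aside (illustration only, not part of the patch): the model-definition fixes above move from the deprecated torch.nn.init names to their trailing-underscore, in-place forms (init.normal_, init.kaiming_uniform_, init.orthogonal_). A small usage sketch with made-up layer names:

    import torch
    from torch import nn
    from torch.nn import init

    conv = nn.Conv2d(3, 8, kernel_size=3)
    final_conv = nn.Conv2d(8, 10, kernel_size=1)

    # The trailing-underscore initializers modify the tensor in place
    # (and return it); the non-underscore names were deprecated.
    init.kaiming_uniform_(conv.weight.data)
    init.normal_(final_conv.weight.data, mean=0.0, std=0.01)
    for m in (conv, final_conv):
        if m.bias is not None:
            m.bias.data.zero_()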
self.assertEqual(module.f(), 789) + @unittest.skipIf(IS_WINDOWS, "C++ API not yet supported on Windows") + def test_cpp_api_extension(self): + here = os.path.abspath(__file__) + pytorch_root = os.path.dirname(os.path.dirname(here)) + api_include = os.path.join(pytorch_root, 'torch', 'csrc', 'api', 'include') + module = torch.utils.cpp_extension.load( + name='cpp_api_extension', + sources='cpp_extensions/cpp_api_extension.cpp', + extra_include_paths=api_include, + extra_cflags=[] if IS_WINDOWS else ['-UTORCH_API_INCLUDE_EXTENSION_H'], + verbose=True) + + net = module.Net(3, 5) + + self.assertTrue(net.training) + net.eval() + self.assertFalse(net.training) + net.train() + self.assertTrue(net.training) + net.eval() + + input = torch.randn(2, 3, dtype=torch.float32) + output = net.forward(input) + self.assertEqual(output, net.forward(input)) + self.assertEqual(list(output.shape), [2, 5]) + + bias = net.get_bias() + self.assertEqual(list(bias.shape), [5]) + net.set_bias(bias + 1) + self.assertEqual(net.get_bias(), bias + 1) + output2 = net.forward(input) + + self.assertNotEqual(output + 1, output2) + + self.assertEqual(len(net.parameters()), 4) + + p = net.named_parameters() + self.assertEqual(type(p), dict) + self.assertEqual(len(p), 4) + self.assertIn('fc.weight', p) + self.assertIn('fc.bias', p) + self.assertIn('bn.weight', p) + self.assertIn('bn.bias', p) + if __name__ == '__main__': common.run_tests() diff --git a/test/test_cuda.py b/test/test_cuda.py index 5d8412192cba8..560aebbdc64e0 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -30,9 +30,11 @@ TestCase = object # noqa: F811 TEST_MAGMA = TEST_CUDA +TEST_LARGE_TENSOR = TEST_CUDA if TEST_CUDA: torch.ones(1).cuda() # has_magma shows up after cuda is initialized TEST_MAGMA = torch.cuda.has_magma + TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 9e9 floating_set = {torch.FloatTensor, torch.DoubleTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor, torch.HalfTensor, torch.cuda.HalfTensor} @@ -889,23 +891,20 @@ def test_type_conversions(self): self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) def test_mul_intertype_scalar(self): - x = torch.tensor(1.5, device='cuda') - y = torch.tensor(3, dtype=torch.int32, device='cuda') - - self.assertEqual(x * y, 4.5) - self.assertEqual(y * x, 4.5) - with self.assertRaisesRegex(RuntimeError, 'expected type'): - y *= x - x *= y - self.assertEqual(x, 4.5) - - x = torch.tensor(1.5, device='cuda', dtype=torch.float16) - self.assertEqual(x * y, 4.5) - # half * int currently promotes to double - with self.assertRaisesRegex(RuntimeError, 'expected type'): + def test_mul(dtype): + x = torch.tensor(1.5, dtype=dtype, device='cuda') + y = torch.tensor(3, dtype=torch.int32, device='cuda') + + self.assertEqual(x * y, 4.5) + self.assertEqual(y * x, 4.5) + with self.assertRaisesRegex(RuntimeError, 'expected type'): + y *= x x *= y - with self.assertRaisesRegex(RuntimeError, 'expected type'): - y *= x + self.assertEqual(x, 4.5) + + test_mul(torch.float16) + test_mul(torch.float32) + test_mul(torch.float64) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") @skipIfRocm @@ -918,6 +917,28 @@ def test_type_conversions_same_gpu(self): def test_neg(self): TestTorch._test_neg(self, lambda t: t.cuda()) + @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") + def test_arithmetic_large_tensor(self): + x = torch.empty(2**30, device='cuda') + + x.fill_(1) + self.assertEqual(x.sum(), 2**30) + + x += 1 + self.assertEqual(x.sum(), 2**31) + + x.fill_(1) + x -= 
0.5 + self.assertEqual(x.sum(), 2**29) + + x.fill_(1) + x *= 2 + self.assertEqual(x.sum(), 2**31) + + x.fill_(1) + x /= 2 + self.assertEqual(x.sum(), 2**29) + def _test_broadcast(self, input): if not TEST_MULTIGPU: raise unittest.SkipTest("only one GPU detected") diff --git a/test/test_distributions.py b/test/test_distributions.py index 5c710daa3a62b..86a63c6608637 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -102,12 +102,12 @@ def is_all_nan(tensor): ]), Example(Beta, [ { - 'concentration1': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), - 'concentration0': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), + 'concentration1': torch.randn(2, 3).exp().requires_grad_(), + 'concentration0': torch.randn(2, 3).exp().requires_grad_(), }, { - 'concentration1': torch.tensor(torch.exp(torch.randn(4)), requires_grad=True), - 'concentration0': torch.tensor(torch.exp(torch.randn(4)), requires_grad=True), + 'concentration1': torch.randn(4).exp().requires_grad_(), + 'concentration0': torch.randn(4).exp().requires_grad_(), }, ]), Example(Categorical, [ @@ -146,29 +146,29 @@ def is_all_nan(tensor): 'scale': torch.tensor([[1.0], [1.0]])} ]), Example(Chi2, [ - {'df': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True)}, - {'df': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True)}, + {'df': torch.randn(2, 3).exp().requires_grad_()}, + {'df': torch.randn(1).exp().requires_grad_()}, ]), Example(StudentT, [ - {'df': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True)}, - {'df': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True)}, + {'df': torch.randn(2, 3).exp().requires_grad_()}, + {'df': torch.randn(1).exp().requires_grad_()}, ]), Example(Dirichlet, [ - {'concentration': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True)}, - {'concentration': torch.tensor(torch.exp(torch.randn(4)), requires_grad=True)}, + {'concentration': torch.randn(2, 3).exp().requires_grad_()}, + {'concentration': torch.randn(4).exp().requires_grad_()}, ]), Example(Exponential, [ - {'rate': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True)}, - {'rate': torch.tensor(torch.randn(1).abs(), requires_grad=True)}, + {'rate': torch.randn(5, 5).abs().requires_grad_()}, + {'rate': torch.randn(1).abs().requires_grad_()}, ]), Example(FisherSnedecor, [ { - 'df1': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), - 'df2': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'df1': torch.randn(5, 5).abs().requires_grad_(), + 'df2': torch.randn(5, 5).abs().requires_grad_(), }, { - 'df1': torch.tensor(torch.randn(1).abs(), requires_grad=True), - 'df2': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'df1': torch.randn(1).abs().requires_grad_(), + 'df2': torch.randn(1).abs().requires_grad_(), }, { 'df1': torch.tensor([1.0]), @@ -177,22 +177,22 @@ def is_all_nan(tensor): ]), Example(Gamma, [ { - 'concentration': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), - 'rate': torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True), + 'concentration': torch.randn(2, 3).exp().requires_grad_(), + 'rate': torch.randn(2, 3).exp().requires_grad_(), }, { - 'concentration': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True), - 'rate': torch.tensor(torch.exp(torch.randn(1)), requires_grad=True), + 'concentration': torch.randn(1).exp().requires_grad_(), + 'rate': torch.randn(1).exp().requires_grad_(), }, ]), Example(Gumbel, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': 
torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, ]), Example(HalfCauchy, [ @@ -200,45 +200,45 @@ def is_all_nan(tensor): {'scale': torch.tensor([[1.0], [1.0]])} ]), Example(HalfNormal, [ - {'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True)}, - {'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True)}, + {'scale': torch.randn(5, 5).abs().requires_grad_()}, + {'scale': torch.randn(1).abs().requires_grad_()}, {'scale': torch.tensor([1e-5, 1e-5], requires_grad=True)} ]), Example(Independent, [ { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'reinterpreted_batch_ndims': 0, }, { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'reinterpreted_batch_ndims': 1, }, { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'reinterpreted_batch_ndims': 2, }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'reinterpreted_batch_ndims': 2, }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'reinterpreted_batch_ndims': 3, }, ]), Example(Laplace, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, { 'loc': torch.tensor([1.0, 0.0], requires_grad=True), @@ -248,11 +248,11 @@ def is_all_nan(tensor): Example(LogNormal, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, { 'loc': torch.tensor([1.0, 0.0], requires_grad=True), @@ -310,11 +310,11 @@ def is_all_nan(tensor): Example(Normal, [ { 'loc': torch.randn(5, 5, requires_grad=True), - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'scale': torch.randn(5, 5).abs().requires_grad_(), }, { 'loc': torch.randn(1, requires_grad=True), - 'scale': torch.tensor(torch.randn(1).abs(), requires_grad=True), + 'scale': torch.randn(1).abs().requires_grad_(), }, { 'loc': torch.tensor([1.0, 0.0], requires_grad=True), @@ -332,8 +332,8 @@ def is_all_nan(tensor): 'alpha': 1.0 }, { - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), - 'alpha': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + 'scale': torch.randn(5, 5).abs().requires_grad_(), + 'alpha': torch.randn(5, 5).abs().requires_grad_() }, { 'scale': torch.tensor([1.0]), @@ -342,10 +342,10 @@ def is_all_nan(tensor): ]), 
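Aside (illustration only, not part of the patch): the long run of test_distributions edits above and below replaces torch.tensor(<derived tensor>, requires_grad=True), which copies the data into a fresh leaf tensor, with an in-place requires_grad_() call on the derived tensor itself. A minimal sketch of the two idioms:

    import torch

    # Old pattern: torch.tensor(...) copies the data into a new leaf tensor.
    scale_old = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True)

    # New pattern: requires_grad_() flips the flag in place on the tensor it
    # is called on and returns that same tensor, so it chains naturally.
    scale_new = torch.randn(5, 5).abs().requires_grad_()

    assert scale_new.requires_grad and scale_new.is_leaf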
Example(Poisson, [ { - 'rate': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), + 'rate': torch.randn(5, 5).abs().requires_grad_(), }, { - 'rate': torch.tensor(torch.randn(3).abs(), requires_grad=True), + 'rate': torch.randn(3).abs().requires_grad_(), }, { 'rate': 0.2, @@ -382,23 +382,23 @@ def is_all_nan(tensor): Example(TransformedDistribution, [ { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'transforms': [], }, { 'base_distribution': Normal(torch.randn(2, 3, requires_grad=True), - torch.tensor(torch.randn(2, 3).abs(), requires_grad=True)), + torch.randn(2, 3).abs().requires_grad_()), 'transforms': ExpTransform(), }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'transforms': [AffineTransform(torch.randn(3, 5), torch.randn(3, 5)), ExpTransform()], }, { 'base_distribution': Normal(torch.randn(2, 3, 5, requires_grad=True), - torch.tensor(torch.randn(2, 3, 5).abs(), requires_grad=True)), + torch.randn(2, 3, 5).abs().requires_grad_()), 'transforms': AffineTransform(1, 2), }, ]), @@ -418,8 +418,8 @@ def is_all_nan(tensor): ]), Example(Weibull, [ { - 'scale': torch.tensor(torch.randn(5, 5).abs(), requires_grad=True), - 'concentration': torch.tensor(torch.randn(1).abs(), requires_grad=True) + 'scale': torch.randn(5, 5).abs().requires_grad_(), + 'concentration': torch.randn(1).abs().requires_grad_() } ]) ] @@ -922,7 +922,7 @@ def test_geometric_sample(self): 'Geometric(prob={})'.format(prob)) def test_binomial(self): - p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) + p = torch.arange(0.05, 1, 0.1).requires_grad_() for total_count in [1, 2, 10]: self._gradcheck_log_prob(lambda p: Binomial(total_count, p), [p]) self._gradcheck_log_prob(lambda p: Binomial(total_count, None, p.log()), [p]) @@ -931,7 +931,7 @@ def test_binomial(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_binomial_log_prob(self): - probs = torch.tensor(torch.arange(0.05, 1, 0.1)) + probs = torch.arange(0.05, 1, 0.1) for total_count in [1, 2, 10]: def ref_log_prob(idx, x, log_prob): @@ -987,7 +987,7 @@ def test_binomial_vectorized_count(self): self.assertEqual(samples.var(dim=0), bin1.variance, prec=0.02) def test_negative_binomial(self): - p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) + p = torch.arange(0.05, 1, 0.1).requires_grad_() for total_count in [1, 2, 10]: self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, p), [p]) self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, None, p.log()), [p]) @@ -996,7 +996,7 @@ def test_negative_binomial(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_negative_binomial_log_prob(self): - probs = torch.tensor(torch.arange(0.05, 1, 0.1)) + probs = torch.arange(0.05, 1, 0.1) for total_count in [1, 2, 10]: def ref_log_prob(idx, x, log_prob): @@ -1142,8 +1142,8 @@ def test_one_hot_categorical_enumerate_support(self): self._check_enumerate_support(OneHotCategorical, examples) def test_poisson_shape(self): - rate = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - rate_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + rate = torch.randn(2, 3).abs().requires_grad_() + rate_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Poisson(rate).sample().size(), (2, 3)) 
self.assertEqual(Poisson(rate).sample((7,)).size(), (7, 2, 3)) self.assertEqual(Poisson(rate_1d).sample().size(), (1,)) @@ -1152,8 +1152,8 @@ def test_poisson_shape(self): @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_poisson_log_prob(self): - rate = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - rate_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + rate = torch.randn(2, 3).abs().requires_grad_() + rate_1d = torch.randn(1).abs().requires_grad_() def ref_log_prob(idx, x, log_prob): l = rate.view(-1)[idx].detach() @@ -1286,9 +1286,9 @@ def pmf(self, samples): def test_uniform(self): low = torch.zeros(5, 5, requires_grad=True) - high = torch.tensor(torch.ones(5, 5) * 3, requires_grad=True) + high = (torch.ones(5, 5) * 3).requires_grad_() low_1d = torch.zeros(1, requires_grad=True) - high_1d = torch.tensor(torch.ones(1) * 3, requires_grad=True) + high_1d = (torch.ones(1) * 3).requires_grad_() self.assertEqual(Uniform(low, high).sample().size(), (5, 5)) self.assertEqual(Uniform(low, high).sample((7,)).size(), (7, 5, 5)) self.assertEqual(Uniform(low_1d, high_1d).sample().size(), (1,)) @@ -1373,7 +1373,7 @@ def test_halfcauchy(self): scale.grad.zero_() def test_halfnormal(self): - std = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + std = torch.randn(5, 5).abs().requires_grad_() std_1d = torch.randn(1, requires_grad=True) std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(HalfNormal(std).sample().size(), (5, 5)) @@ -1399,7 +1399,7 @@ def test_halfnormal(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_halfnormal_logprob(self): - std = torch.tensor(torch.randn(5, 1).abs(), requires_grad=True) + std = torch.randn(5, 1).abs().requires_grad_() def ref_log_prob(idx, x, log_prob): s = std.view(-1)[idx].detach() @@ -1418,9 +1418,9 @@ def test_halfnormal_sample(self): def test_lognormal(self): mean = torch.randn(5, 5, requires_grad=True) - std = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + std = torch.randn(5, 5).abs().requires_grad_() mean_1d = torch.randn(1, requires_grad=True) - std_1d = torch.randn(1, requires_grad=True) + std_1d = torch.randn(1).abs().requires_grad_() mean_delta = torch.tensor([1.0, 0.0]) std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(LogNormal(mean, std).sample().size(), (5, 5)) @@ -1448,7 +1448,7 @@ def test_lognormal(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_lognormal_logprob(self): mean = torch.randn(5, 1, requires_grad=True) - std = torch.tensor(torch.randn(5, 1).abs(), requires_grad=True) + std = torch.randn(5, 1).abs().requires_grad_() def ref_log_prob(idx, x, log_prob): m = mean.view(-1)[idx].detach() @@ -1534,9 +1534,9 @@ def test_logisticnormal_sample(self): def test_normal(self): loc = torch.randn(5, 5, requires_grad=True) - scale = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + scale = torch.randn(5, 5).abs().requires_grad_() loc_1d = torch.randn(1, requires_grad=True) - scale_1d = torch.randn(1, requires_grad=True) + scale_1d = torch.randn(1).abs().requires_grad_() loc_delta = torch.tensor([1.0, 0.0]) scale_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(Normal(loc, scale).sample().size(), (5, 5)) @@ -1591,11 +1591,11 @@ def test_lowrank_multivariate_normal_shape(self): # construct PSD covariance cov_factor = torch.randn(3, 1, requires_grad=True) - cov_diag = torch.tensor(torch.randn(3).abs(), requires_grad=True) + cov_diag = torch.randn(3).abs().requires_grad_() # construct batch of PSD covariances cov_factor_batched = 
torch.randn(6, 5, 3, 2, requires_grad=True) - cov_diag_batched = torch.tensor(torch.randn(6, 5, 3).abs(), requires_grad=True) + cov_diag_batched = torch.randn(6, 5, 3).abs().requires_grad_() # ensure that sample, batch, event shapes all handled correctly self.assertEqual(LowRankMultivariateNormal(mean, cov_factor, cov_diag) @@ -1635,7 +1635,7 @@ def test_lowrank_multivariate_normal_shape(self): def test_lowrank_multivariate_normal_log_prob(self): mean = torch.randn(3, requires_grad=True) cov_factor = torch.randn(3, 1, requires_grad=True) - cov_diag = torch.tensor(torch.randn(3).abs(), requires_grad=True) + cov_diag = torch.randn(3).abs().requires_grad_() cov = cov_factor.matmul(cov_factor.t()) + cov_diag.diag() # check that logprob values match scipy logpdf, @@ -1651,7 +1651,7 @@ def test_lowrank_multivariate_normal_log_prob(self): # Double-check that batched versions behave the same as unbatched mean = torch.randn(5, 3, requires_grad=True) cov_factor = torch.randn(5, 3, 2, requires_grad=True) - cov_diag = torch.tensor(torch.randn(5, 3).abs(), requires_grad=True) + cov_diag = torch.randn(5, 3).abs().requires_grad_() dist_batched = LowRankMultivariateNormal(mean, cov_factor, cov_diag) dist_unbatched = [LowRankMultivariateNormal(mean[i], cov_factor[i], cov_diag[i]) @@ -1669,7 +1669,7 @@ def test_lowrank_multivariate_normal_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] mean = torch.randn(5, requires_grad=True) cov_factor = torch.randn(5, 1, requires_grad=True) - cov_diag = torch.tensor(torch.randn(5).abs(), requires_grad=True) + cov_diag = torch.randn(5).abs().requires_grad_() cov = cov_factor.matmul(cov_factor.t()) + cov_diag.diag() self._check_sampler_sampler(LowRankMultivariateNormal(mean, cov_factor, cov_diag), @@ -1680,7 +1680,7 @@ def test_lowrank_multivariate_normal_sample(self): def test_lowrank_multivariate_normal_properties(self): loc = torch.randn(5) cov_factor = torch.randn(5, 2) - cov_diag = torch.tensor(torch.randn(5).abs()) + cov_diag = torch.randn(5).abs() cov = cov_factor.matmul(cov_factor.t()) + cov_diag.diag() m1 = LowRankMultivariateNormal(loc, cov_factor, cov_diag) m2 = MultivariateNormal(loc=loc, covariance_matrix=cov) @@ -1695,7 +1695,7 @@ def test_lowrank_multivariate_normal_moments(self): set_rng_seed(0) # see Note [Randomized statistical tests] mean = torch.randn(5) cov_factor = torch.randn(5, 2) - cov_diag = torch.tensor(torch.randn(5).abs()) + cov_diag = torch.randn(5).abs() d = LowRankMultivariateNormal(mean, cov_factor, cov_diag) samples = d.rsample((100000,)) empirical_mean = samples.mean(0) @@ -1710,13 +1710,13 @@ def test_multivariate_normal_shape(self): # construct PSD covariance tmp = torch.randn(3, 10) - cov = torch.tensor(torch.matmul(tmp, tmp.t()) / tmp.shape[-1], requires_grad=True) - prec = torch.tensor(cov.inverse(), requires_grad=True) - scale_tril = torch.tensor(torch.potrf(cov, upper=False), requires_grad=True) + cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() + prec = cov.inverse().requires_grad_() + scale_tril = torch.potrf(cov, upper=False).requires_grad_() # construct batch of PSD covariances tmp = torch.randn(6, 5, 3, 10) - cov_batched = torch.tensor((tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1), requires_grad=True) + cov_batched = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() prec_batched = [C.inverse() for C in cov_batched.view((-1, 3, 3))] prec_batched = torch.stack(prec_batched).view(cov_batched.shape) scale_tril_batched = [torch.potrf(C, upper=False) for C in 
cov_batched.view((-1, 3, 3))] @@ -1753,9 +1753,9 @@ def test_multivariate_normal_shape(self): def test_multivariate_normal_log_prob(self): mean = torch.randn(3, requires_grad=True) tmp = torch.randn(3, 10) - cov = torch.tensor(torch.matmul(tmp, tmp.t()) / tmp.shape[-1], requires_grad=True) - prec = torch.tensor(cov.inverse(), requires_grad=True) - scale_tril = torch.tensor(torch.potrf(cov, upper=False), requires_grad=True) + cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() + prec = cov.inverse().requires_grad_() + scale_tril = torch.potrf(cov, upper=False).requires_grad_() # check that logprob values match scipy logpdf, # and that covariance and scale_tril parameters are equivalent @@ -1774,7 +1774,7 @@ def test_multivariate_normal_log_prob(self): # Double-check that batched versions behave the same as unbatched mean = torch.randn(5, 3, requires_grad=True) tmp = torch.randn(5, 3, 10) - cov = torch.tensor((tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1), requires_grad=True) + cov = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() dist_batched = MultivariateNormal(mean, cov) dist_unbatched = [MultivariateNormal(mean[i], cov[i]) for i in range(mean.size(0))] @@ -1791,9 +1791,9 @@ def test_multivariate_normal_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] mean = torch.randn(3, requires_grad=True) tmp = torch.randn(3, 10) - cov = torch.tensor(torch.matmul(tmp, tmp.t()) / tmp.shape[-1], requires_grad=True) - prec = torch.tensor(cov.inverse(), requires_grad=True) - scale_tril = torch.tensor(torch.potrf(cov, upper=False), requires_grad=True) + cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() + prec = cov.inverse().requires_grad_() + scale_tril = torch.potrf(cov, upper=False).requires_grad_() self._check_sampler_sampler(MultivariateNormal(mean, cov), scipy.stats.multivariate_normal(mean.detach().numpy(), cov.detach().numpy()), @@ -1828,8 +1828,8 @@ def test_multivariate_normal_moments(self): self.assertEqual(d.variance, empirical_var, prec=0.05) def test_exponential(self): - rate = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) - rate_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + rate = torch.randn(5, 5).abs().requires_grad_() + rate_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Exponential(rate).sample().size(), (5, 5)) self.assertEqual(Exponential(rate).sample((7,)).size(), (7, 5, 5)) self.assertEqual(Exponential(rate_1d).sample((1,)).size(), (1, 1)) @@ -1864,7 +1864,7 @@ def test_exponential_sample(self): def test_laplace(self): loc = torch.randn(5, 5, requires_grad=True) - scale = torch.tensor(torch.randn(5, 5).abs(), requires_grad=True) + scale = torch.randn(5, 5).abs().requires_grad_() loc_1d = torch.randn(1, requires_grad=True) scale_1d = torch.randn(1, requires_grad=True) loc_delta = torch.tensor([1.0, 0.0]) @@ -1915,10 +1915,10 @@ def test_laplace_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_gamma_shape(self): - alpha = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - beta = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - alpha_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) - beta_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) + alpha = torch.randn(2, 3).exp().requires_grad_() + beta = torch.randn(2, 3).exp().requires_grad_() + alpha_1d = torch.randn(1).exp().requires_grad_() + beta_1d = torch.randn(1).exp().requires_grad_() self.assertEqual(Gamma(alpha, 
beta).sample().size(), (2, 3)) self.assertEqual(Gamma(alpha, beta).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Gamma(alpha_1d, beta_1d).sample((1,)).size(), (1, 1)) @@ -1937,10 +1937,10 @@ def ref_log_prob(idx, x, log_prob): @unittest.skipIf(not TEST_CUDA, "CUDA not found") @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_gamma_gpu_shape(self): - alpha = torch.tensor(torch.exp(torch.randn(2, 3).cuda()), requires_grad=True) - beta = torch.tensor(torch.exp(torch.randn(2, 3).cuda()), requires_grad=True) - alpha_1d = torch.tensor(torch.exp(torch.randn(1).cuda()), requires_grad=True) - beta_1d = torch.tensor(torch.exp(torch.randn(1).cuda()), requires_grad=True) + alpha = torch.randn(2, 3).cuda().exp().requires_grad_() + beta = torch.randn(2, 3).cuda().exp().requires_grad_() + alpha_1d = torch.randn(1).cuda().exp().requires_grad_() + beta_1d = torch.randn(1).cuda().exp().requires_grad_() self.assertEqual(Gamma(alpha, beta).sample().size(), (2, 3)) self.assertEqual(Gamma(alpha, beta).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Gamma(alpha_1d, beta_1d).sample((1,)).size(), (1, 1)) @@ -1978,10 +1978,10 @@ def test_gamma_gpu_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_pareto(self): - scale = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - alpha = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) - alpha_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + scale = torch.randn(2, 3).abs().requires_grad_() + alpha = torch.randn(2, 3).abs().requires_grad_() + scale_1d = torch.randn(1).abs().requires_grad_() + alpha_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Pareto(scale_1d, 0.5).mean, inf, allow_inf=True) self.assertEqual(Pareto(scale_1d, 0.5).variance, inf, allow_inf=True) self.assertEqual(Pareto(scale, alpha).sample().size(), (2, 3)) @@ -2010,9 +2010,9 @@ def test_pareto_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_gumbel(self): loc = torch.randn(2, 3, requires_grad=True) - scale = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + scale = torch.randn(2, 3).abs().requires_grad_() loc_1d = torch.randn(1, requires_grad=True) - scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + scale_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Gumbel(loc, scale).sample().size(), (2, 3)) self.assertEqual(Gumbel(loc, scale).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Gumbel(loc_1d, scale_1d).sample().size(), (1,)) @@ -2038,8 +2038,8 @@ def test_gumbel_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_fishersnedecor(self): - df1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - df2 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + df1 = torch.randn(2, 3).abs().requires_grad_() + df2 = torch.randn(2, 3).abs().requires_grad_() df1_1d = torch.randn(1).abs() df2_1d = torch.randn(1).abs() self.assertTrue(is_all_nan(FisherSnedecor(1, 2).mean)) @@ -2069,8 +2069,8 @@ def test_fishersnedecor_sample(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_chi2_shape(self): - df = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) + df = torch.randn(2, 3).exp().requires_grad_() + df_1d = torch.randn(1).exp().requires_grad_() self.assertEqual(Chi2(df).sample().size(), (2, 3)) self.assertEqual(Chi2(df).sample((5,)).size(), (5, 2, 3)) 
self.assertEqual(Chi2(df_1d).sample((1,)).size(), (1, 1)) @@ -2096,8 +2096,8 @@ def test_chi2_sample(self): @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_studentT(self): - df = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) + df = torch.randn(2, 3).exp().requires_grad_() + df_1d = torch.randn(1).exp().requires_grad_() self.assertTrue(is_all_nan(StudentT(1).mean)) self.assertTrue(is_all_nan(StudentT(1).variance)) self.assertEqual(StudentT(2).variance, inf, allow_inf=True) @@ -2137,8 +2137,8 @@ def test_studentT_log_prob(self): self.assertAlmostEqual(float(actual_log_prob[i]), float(expected_log_prob), places=3) def test_dirichlet_shape(self): - alpha = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - alpha_1d = torch.tensor(torch.exp(torch.randn(4)), requires_grad=True) + alpha = torch.randn(2, 3).exp().requires_grad_() + alpha_1d = torch.randn(4).exp().requires_grad_() self.assertEqual(Dirichlet(alpha).sample().size(), (2, 3)) self.assertEqual(Dirichlet(alpha).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Dirichlet(alpha_1d).sample().size(), (4,)) @@ -2165,10 +2165,10 @@ def test_dirichlet_sample(self): multivariate=True) def test_beta_shape(self): - con1 = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - con0 = torch.tensor(torch.exp(torch.randn(2, 3)), requires_grad=True) - con1_1d = torch.tensor(torch.exp(torch.randn(4)), requires_grad=True) - con0_1d = torch.tensor(torch.exp(torch.randn(4)), requires_grad=True) + con1 = torch.randn(2, 3).exp().requires_grad_() + con0 = torch.randn(2, 3).exp().requires_grad_() + con1_1d = torch.randn(4).exp().requires_grad_() + con0_1d = torch.randn(4).exp().requires_grad_() self.assertEqual(Beta(con1, con0).sample().size(), (2, 3)) self.assertEqual(Beta(con1, con0).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Beta(con1_1d, con0_1d).sample().size(), (4,)) @@ -2269,7 +2269,7 @@ def test_cdf_log_prob(self): for Dist, params in EXAMPLES: for i, param in enumerate(params): dist = Dist(**param) - samples = torch.tensor(dist.sample()) + samples = dist.sample() if samples.dtype.is_floating_point: samples.requires_grad_() try: @@ -3827,7 +3827,7 @@ def test_equality(self): def test_forward_inverse_cache(self): for transform in self.transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() try: y = transform(x) except NotImplementedError: @@ -3854,7 +3854,7 @@ def test_forward_inverse_cache(self): def test_forward_inverse_no_cache(self): for transform in self.transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() try: y = transform(x) x2 = transform.inv(y.clone()) # bypass cache @@ -3883,7 +3883,7 @@ def test_univariate_forward_jacobian(self): for transform in self.transforms: if transform.event_dim > 0: continue - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() try: y = transform(x) actual = transform.log_abs_det_jacobian(x, y) @@ -3900,7 +3900,7 @@ def test_univariate_inverse_jacobian(self): for transform in self.transforms: if transform.event_dim > 0: continue - y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + y = self._generate_data(transform.inv).requires_grad_() try: x = transform.inv(y) actual = transform.log_abs_det_jacobian(x, y) @@ -3980,7 +3980,7 @@ def 
test_transformed_distribution_shapes(self): def test_jit_fwd(self): for transform in self.unique_transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() def f(x): return transform(x) @@ -3991,12 +3991,12 @@ def f(x): continue # check on different inputs - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() self.assertEqual(f(x), traced_f(x)) def test_jit_inv(self): for transform in self.unique_transforms: - y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + y = self._generate_data(transform.inv).requires_grad_() def f(y): return transform.inv(y) @@ -4007,12 +4007,12 @@ def f(y): continue # check on different inputs - y = torch.tensor(self._generate_data(transform.inv), requires_grad=True) + y = self._generate_data(transform.inv).requires_grad_() self.assertEqual(f(y), traced_f(y)) def test_jit_jacobian(self): for transform in self.unique_transforms: - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() def f(x): y = transform(x) @@ -4024,7 +4024,7 @@ def f(x): continue # check on different inputs - x = torch.tensor(self._generate_data(transform), requires_grad=True) + x = self._generate_data(transform).requires_grad_() self.assertEqual(f(x), traced_f(x)) diff --git a/test/test_jit.py b/test/test_jit.py index 35597768fa033..e4281e5a795d3 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -23,7 +23,8 @@ import tempfile import shutil import warnings -from test_autograd import method_tests, create_input, unpack_variables, \ +from test_autograd import method_tests as autograd_method_tests +from test_autograd import create_input, unpack_variables, \ exclude_tensor_method, non_differentiable, EXCLUDE_GRADCHECK, EXCLUDE_FUNCTIONAL from copy import deepcopy import random @@ -531,6 +532,21 @@ def forward(self, input): input = torch.rand(3, 4) self.assertEqual(2 * input + 1, m(input)) + def test_diff_subgraph_clones_constants(self): + @torch.jit.script + def f(x, y): + return x + x + y + x + y + x + y + x + y + x + + def count_constants(graph): + return sum(node.kind() == 'prim::Constant' for node in graph.nodes()) + + graph = f.graph.copy() + self.run_pass('cse', graph) + self.run_pass('create_autodiff_subgraphs', graph) + nodes = list(graph.nodes()) + self.assertEqual(count_constants(graph), 1) + self.assertEqual(count_constants(nodes[1].g('Subgraph')), 1) + # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, # and we attempted to trace its derivative (which is not @@ -1208,13 +1224,18 @@ def run(**kwargs): def fn(x): return x + torch.ones(2, 3, **kwargs) - input = torch.ones(2, 3, **kwargs) + + input_kwargs = kwargs.copy() + if 'out' in input_kwargs: + del input_kwargs['out'] + input = torch.ones(2, 3, **input_kwargs) self.checkTrace(fn, (input,), inputs_require_grads=inputs_require_grads) # check we recorded 'ones' and did not just record a constant tfn = torch.jit.trace(fn, input) self.assertTrue("ones" in str(tfn.graph)) run() run(dtype=torch.int, inputs_require_grads=False) + run(out=torch.tensor([])) if RUN_CUDA: run(device="cuda:0") if RUN_CUDA_MULTI_GPU: @@ -3501,7 +3522,7 @@ def test_fuser_multiple_blocks(this, that, theother, meme): @enable_cpu_fuser def test_scalar_fusion(self): def fn(x, y): - return x + y.type_as(x) + return 2 * x + y x = torch.tensor(0.1, dtype=torch.float, 
device='cpu') y = torch.tensor(1, dtype=torch.float, device='cpu') @@ -7626,6 +7647,18 @@ def forward(self, x, y): EXCLUDE_TRACED = { 'test_split_dim', 'test_split_dim_neg0', + + # The following fail due to #12024. + # A prim::ListConstruct is involved and the indices get traced as DynamicType, + # which always require_grad. This causes a crash in autodiff. + 'test___getitem___adv_index', + 'test___getitem___adv_index_beg', + 'test___getitem___adv_index_comb', + 'test___getitem___adv_index_dup', + 'test___getitem___adv_index_sub', + 'test___getitem___adv_index_sub_2', + 'test___getitem___adv_index_sub_3', + 'test___getitem___adv_index_var', } EXCLUDE_TYPE_CHECK = { @@ -7736,11 +7769,17 @@ def new_fn(*tensors_): # create a trace function from input fn -def create_traced_fn(self, fn): +# +# disable_autodiff_subgraph_inlining: +# Don't inline autodiff subgraphs so we can test autodiff +def create_traced_fn(self, fn, + disable_autodiff_subgraph_inlining=False): def traced_fn(*inputs, **kwargs): fn_tensors, inputs_tensors = partial_apply_nontensors(fn, inputs, **kwargs) traced = torch.jit.trace(fn_tensors, inputs_tensors) self.assertExportImport(traced.graph, inputs_tensors) + if disable_autodiff_subgraph_inlining: + traced.debug_disable_autodiff_subgraph_inlining() output = traced(*inputs_tensors) traced_fn.last_graph = traced.graph_for(*inputs_tensors) return output @@ -7761,7 +7800,8 @@ def get_constant(x): # create a script function from (name, func_type, output_process_fn), # returns a function takes in (args, kwargs) and runs the compiled function and # then applies the post process fn to the outputs -def create_script_fn(self, method_name, func_type, output_process_fn): +def create_script_fn(self, method_name, func_type, output_process_fn, + disable_autodiff_subgraph_inlining=False): def script_fn(*args, **kwargs): formals = [] tensors = [] @@ -7792,6 +7832,8 @@ def script_fn(*args, **kwargs): import math CU = torch.jit.CompilationUnit(script) + if disable_autodiff_subgraph_inlining: + CU.the_method.debug_disable_autodiff_subgraph_inlining() self.assertExportImport(CU.the_method.graph, tensors) output = output_process_fn(CU.the_method(*tensors)) script_fn.last_graph = CU.the_method.graph_for(*tensors) @@ -8129,7 +8171,7 @@ def func(x): ]) -def add_test( +def add_autograd_test( name, self_size, args, @@ -8172,14 +8214,20 @@ def fn(*inputs, **kwargs): check_types = test_name not in EXCLUDE_TYPE_CHECK if not is_inplace and name not in EXCLUDE_GRADCHECK and not exclude_tensor_method(name, test_name): + # Test with disable_autodiff_subgraph_inlining, which forces the graph + # to contain DifferentiableGraph nodes whenever possible. 
This allows us + # to test autodiff; we assume that autograd is correct and use autodiff for backprop if test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(self, fn), + check_against_reference(self, + create_traced_fn(self, fn, + disable_autodiff_subgraph_inlining=True), fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) if not is_magic_method and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(self, name, 'method', output_process_fn), + create_script_fn(self, name, 'method', output_process_fn, + disable_autodiff_subgraph_inlining=True), fn, (self_variable,) + args_variable, kwargs_variable, check_types=check_types) @@ -8193,12 +8241,15 @@ def fn(*inputs, **kwargs): f_args_tensor = (self_tensor,) + args_tensor if not is_inplace and test_name not in EXCLUDE_TRACED: - check_against_reference(self, create_traced_fn(self, fn), fn, - f_args_variable, kwargs_variable, check_types=check_types) + check_against_reference(self, + create_traced_fn(self, fn, + disable_autodiff_subgraph_inlining=True), + fn, f_args_variable, kwargs_variable, check_types=check_types) if not is_inplace and test_name not in EXCLUDE_SCRIPT: check_against_reference(self, - create_script_fn(self, name, 'functional', output_process_fn), + create_script_fn(self, name, 'functional', output_process_fn, + disable_autodiff_subgraph_inlining=True), fn, f_args_variable, kwargs_variable, check_types=check_types) @@ -8253,8 +8304,8 @@ def post_add_test(test_name, skipTestIf, do_test): setattr(TestJitGenerated, test_name, do_test) -for test in method_tests: - add_test(*test) +for test in autograd_method_tests: + add_autograd_test(*test) for test in nn_functional_tests: add_nn_test(*test) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index d09a07a7b550c..9f65e7bd366b9 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -144,7 +144,7 @@ jit::tracer::ensureUnique("${name}", ${mutable_input}); """) -ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${input}", ${input});""") +ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${name}", ${input});""") POST_RECORD_TRACE = CodeTemplate("""\ if (tracer_state) { @@ -154,6 +154,18 @@ """) +FACTORY_FUNCTION_NAMES = None + + +def find_factory_functions(declarations): + global FACTORY_FUNCTION_NAMES + FACTORY_FUNCTION_NAMES = set() + + for declaration in declarations: + if any(arg['simple_type'] == 'TensorOptions' for arg in declaration['arguments']): + FACTORY_FUNCTION_NAMES.add(declaration['api_name']) + + def should_trace(declaration): # Operations involving Storage or Type are not traceable at the moment if any(arg['simple_type'] in {'Storage', 'Type'} for arg in declaration['arguments']): @@ -185,17 +197,30 @@ def record_trace_outputs(declaration): def format_trace(declaration): local = {} + local['trace_name'] = trace_name = uninplace_api_name(declaration['api_name']) + + # *_out functions take the result as a first argument, but since we're + # going to de-inplace the call, we need to remove it from the argument list + trace_inputs = declaration['arguments'] + if declaration['name'].endswith('_out'): + trace_inputs = trace_inputs[1:] + trace_input_spec = [(i['name'], i['name']) for i in trace_inputs] + + # factories are a bit special because their out-of-place overloads + # take an extra TensorOptions argument, which is missing in the _out function + has_factory_name = trace_name in 
FACTORY_FUNCTION_NAMES + is_out_overload = any(arg['name'] == 'result' for arg in declaration['arguments']) + if has_factory_name and is_out_overload: + trace_input_spec.append(('result', 'result.options()')) + + local['add_trace_inputs'] = \ + '\n'.join(ADD_TRACE_INPUT.substitute(name=name, input=value) for name, value in trace_input_spec) - add_trace_inputs = [] - for argument in declaration['arguments']: - add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name'])) - local['add_trace_inputs'] = '\n'.join(add_trace_inputs) - local['inplace_guard'] = '' # Record inplace operations as out-of-place operations (e.g., # not add_ but add) # TODO: Add a proper concept of side effects to the IR, and # properly record inplace operations. - local['trace_name'] = uninplace_api_name(declaration['api_name']) + local['inplace_guard'] = '' if local['trace_name'] != declaration['api_name']: local['inplace_guard'] = INPLACE_GUARD.substitute(name=declaration['api_name'], mutable_input=declaration['arguments'][0]['name']) @@ -214,6 +239,7 @@ def gen_variable_type(out, aten_declarations, template_path): implementation of each function dispatches to the base tensor type to compute the output. The grad_fn is attached to differentiable functions. """ + find_factory_functions(aten_declarations) VARIABLE_TYPE_H = CodeTemplate.from_file(template_path + '/VariableType.h') VARIABLE_TYPE_CPP = CodeTemplate.from_file(template_path + '/VariableType.cpp') diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 36cb420fb1be9..f30701a406517 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -219,7 +219,7 @@ Tensor prod_backward(Tensor grad, const Tensor& input, Tensor result, int64_t di Tensor zero_mask = (input == 0); Tensor slice_zero_count = zero_mask.sum(dim, true); - int64_t total_zeros = slice_zero_count.sum().toCLong(); + int64_t total_zeros = slice_zero_count.sum().item(); if (total_zeros == 0) { return (grad * result) / input; } else { @@ -321,7 +321,7 @@ Tensor cumprod_backward(const Tensor &grad, const Tensor &input, int64_t dim) { } // Simple case with nonzero elements in the input - if ((input != 0).all().toCByte()) { + if ((input != 0).all().item()) { Tensor result = at::cumprod(input, dim); return sum_scan_exclusive(result * grad, dim) / input; } @@ -1600,7 +1600,7 @@ Tensor symeig_backward(const std::vector &grads, cons // Invertible case is derived from Jacobi's formula, and also can be found at: // http://eprints.maths.ox.ac.uk/1079/1/NA-08-01.pdf Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) { - auto det_val = det.toCDouble(); + auto det_val = det.item(); if (det_val != 0 /* invertible */) { return grad * det * self.inverse().t(); } else /* otherwise det = \prod(sigma) = 0, use svd */ { @@ -1612,7 +1612,7 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) } Tensor logdet_backward(const Tensor & grad, const Tensor& self, const Tensor& logdet) { - auto logdet_val = logdet.toCDouble(); + auto logdet_val = logdet.item(); if (logdet_val != -INFINITY /* det != 0, invertible */) { return grad * self.inverse().t(); } else /* otherwise det = \prod(sigma) = 0, use svd */ { @@ -1628,7 +1628,7 @@ Tensor slogdet_backward(const std::vector &grads, const Tensor& self, const Tensor& signdet, const Tensor& logabsdet) { AT_ASSERTM(!grads[0].defined(), "slogdet's sign output should never have gradient"); - auto signdet_val = signdet.toCDouble(); + 
auto signdet_val = signdet.item(); if (signdet_val != 0 /* det != 0, invertible */) { return grads[1] * self.inverse().t(); } else /* otherwise det = \prod(sigma) = 0, use svd */ { diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index d697ec8a77420..24ac92dd63926 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -177,11 +177,11 @@ bool VariableType::isVariableType(const at::Type& type) { return type.is_variable(); } -at::Type* VariableType::getVariableTypeFromBaseType(const at::Type& baseType) { +at::TypeExtendedInterface* VariableType::getVariableTypeFromBaseType(const at::Type& baseType) { auto id = static_cast(baseType.ID()); if(id >= type_to_variable_type.size()) return nullptr; - return type_to_variable_type[id].get(); + return static_cast(type_to_variable_type[id].get()); } namespace { diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 446fb5b889f47..045279d4cce64 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -53,7 +53,7 @@ struct TORCH_API VariableType final : public at::TypeDefault { Storage unsafeStorageFromTH(void * th_pointer, bool retain) const override; at::Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const override; - static at::Type* getVariableTypeFromBaseType(const at::Type& baseType); + static at::TypeExtendedInterface* getVariableTypeFromBaseType(const at::Type& baseType); static bool isVariableType(const at::Type& type); static std::vector allCUDATypes(); static std::vector allCPUTypes(); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index d92ad3dbf7688..c10de2c19f6f7 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -156,7 +156,7 @@ static double dispatch_to_CDouble(const Tensor & self) { if (self.numel() != 1) { throw ValueError("only one element tensors can be converted to Python scalars"); } - return self.toCDouble(); + return self.item(); } static std::complex dispatch_to_CComplexDouble(const Tensor & self) { @@ -165,7 +165,7 @@ static std::complex dispatch_to_CComplexDouble(const Tensor & self) { if (self.numel() != 1) { throw ValueError("only one element tensors can be converted to Python scalars"); } - return self.toCComplexDouble(); + return self.item>(); } static int64_t dispatch_to_CLong(const Tensor & self) { @@ -174,7 +174,7 @@ static int64_t dispatch_to_CLong(const Tensor & self) { if (self.numel() != 1) { throw ValueError("only one element tensors can be converted to Python scalars"); } - return self.toCLong(); + return self.item(); } static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { @@ -190,7 +190,7 @@ static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { jit::tracer::warn("Converting a tensor to a Python integer", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; if (isFloatingType(self_.type().scalarType())) { - // we can't dispatch to toCLong here because we want to avoid ATen overflow checks; + // we can't dispatch to item here because we want to avoid ATen overflow checks; // the python integral type (long in python2) can't overflow. 
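The `toCLong()` / `toCDouble()` / `toCByte()` accessors being retired throughout this patch all map onto the templated `Tensor::item<T>()`, which mirrors the Python-side `Tensor.item()`: it extracts the value of a one-element tensor and refuses anything larger. A quick Python illustration of those semantics (illustrative only, not part of the patch):

    import torch

    total = torch.tensor([[1, 2], [3, 4]]).sum()   # 0-dim tensor holding 10
    assert total.numel() == 1
    print(total.item())                            # -> 10, as a plain Python int

    # item() rejects tensors with more than one element, matching the
    # "only one element tensors can be converted to Python scalars" guard above.
    try:
        torch.arange(3).item()
    except (ValueError, RuntimeError):
        pass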
return THPUtils_packDoubleAsInt(dispatch_to_CDouble(self_)); } else { diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 123ba5f303e09..c924b593efc23 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -172,6 +172,7 @@ goto:eof cd build cmake .. %CMAKE_GENERATOR_COMMAND% ^ -DCMAKE_BUILD_TYPE=%BUILD_TYPE% ^ + -DTORCH_BUILD_VERSION="%PYTORCH_BUILD_VERSION%" ^ -DBUILD_TORCH="%BUILD_TORCH%" ^ -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ -DNO_API=ON ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 01cb82f49c596..184c60b7c444f 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -273,6 +273,7 @@ function build_caffe2() { -DCMAKE_INSTALL_MESSAGE="LAZY" \ -DPYTHON_EXECUTABLE=$PYTORCH_PYTHON \ -DBUILDING_WITH_TORCH_LIBS=ON \ + -DTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION" \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_TORCH=$BUILD_TORCH \ -DBUILD_PYTHON=$BUILD_PYTHON \ diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index b7326e526baa8..f6fdc7505d996 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -351,7 +351,7 @@ def main(): help='path to Declarations.yaml') parser.add_argument('out', metavar='OUT', help='path to output directory') - parser.add_argument('template-path', metavar='TEMPLATE_PATH', + parser.add_argument('template_path', metavar='TEMPLATE_PATH', help='path to templates directory') args = parser.parse_args() gen_jit_dispatch(args.declarations, args.out, args.template_path) diff --git a/tools/setup_helpers/nccl.py b/tools/setup_helpers/nccl.py index 703446520870a..c1cc88657ebf6 100644 --- a/tools/setup_helpers/nccl.py +++ b/tools/setup_helpers/nccl.py @@ -33,9 +33,11 @@ os.path.join(ENV_ROOT, 'lib64') if ENV_ROOT is not None else None, os.path.join(CUDA_HOME, 'lib'), os.path.join(CUDA_HOME, 'lib64'), + '/usr/local/lib', '/usr/lib/x86_64-linux-gnu/', '/usr/lib/powerpc64le-linux-gnu/', '/usr/lib/aarch64-linux-gnu/', + '/usr/lib', ] + gather_paths([ 'LIBRARY_PATH', ]) + gather_paths([ @@ -45,7 +47,9 @@ INCLUDE_DIR, ENV_ROOT, os.path.join(ENV_ROOT, 'include') if ENV_ROOT is not None else None, - '/usr/include' + os.path.join(CUDA_HOME, 'include'), + '/usr/local/include', + '/usr/include', ])) if IS_CONDA: diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index be13aaa61b97b..ce337e93c8546 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -12,12 +12,6 @@ endif() option(BUILD_TEST "Build torch test binaries" ON) option(TORCH_STATIC "Build libtorch.a rather than libtorch.so" OFF) -# TODO: Unify with version from setup.py -set(TORCH_VERSION_MAJOR 0) -set(TORCH_VERSION_MINOR 4) -set(TORCH_VERSION_PATCH 1) -set(TORCH_VERSION "${TORCH_VERSION_MAJOR}.${TORCH_VERSION_MINOR}.${TORCH_VERSION_PATCH}") - set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(TORCH_ROOT "${TORCH_SRC_DIR}/..") @@ -411,7 +405,7 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch FILES_MATCHING PATTERN "*.h") -install(FILES "${TORCH_SRC_DIR}/script.h" +install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) install(TARGETS torch diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index bfe230a597215..48b89642864ed 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -12,20 +12,18 @@ namespace nn { /// 
Options for `Dropout` and `FeatureDropout`. struct DropoutOptions { - DropoutOptions(double rate); + /* implicit */ DropoutOptions(double rate = 0.5); /// The probability with which a particular component of the input is set to /// zero. /// Changes to this parameter at runtime are effective. - TORCH_ARG(double, rate) = 0.5; + TORCH_ARG(double, rate); }; namespace detail { template class DropoutImplBase : public torch::nn::Cloneable { public: - explicit DropoutImplBase(double rate) - : DropoutImplBase(DropoutOptions(rate)) {} - explicit DropoutImplBase(DropoutOptions options_); + explicit DropoutImplBase(DropoutOptions options_ = DropoutOptions()); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/sequential.h b/torch/csrc/api/include/torch/nn/modules/sequential.h index e4839ac41a910..3ee80042020b1 100644 --- a/torch/csrc/api/include/torch/nn/modules/sequential.h +++ b/torch/csrc/api/include/torch/nn/modules/sequential.h @@ -92,6 +92,8 @@ class SequentialImpl : public Cloneable { using Iterator = std::vector::iterator; using ConstIterator = std::vector::const_iterator; + SequentialImpl() = default; + /// Constructs the `Sequential` from a variadic list of modules. template explicit SequentialImpl(Modules&&... modules) { diff --git a/torch/csrc/api/include/torch/nn/pimpl-inl.h b/torch/csrc/api/include/torch/nn/pimpl-inl.h new file mode 100644 index 0000000000000..9da1c38a8372d --- /dev/null +++ b/torch/csrc/api/include/torch/nn/pimpl-inl.h @@ -0,0 +1,47 @@ +// This class exists only to do SFINAE on abstract types `T` that are really +// `ModuleHolder`, because there's no good way to say that `T` is a +// `ModuleHolder` over some unknown type `ModuleType`. With this, you can do +// `enable_if_t>`. +struct ModuleHolderIndicator {}; + +// A type trait that is true for types that are `ModuleHolder`s. +template +using is_module_holder = std::is_base_of>; + +template +using disable_if_module_holder_t = disable_if_t::value>; + +// A collection of templates that answer the question whether a type `T` is a +// `ModuleHolder`, and if so whether its contained type is of type `C`. This is +// tricky because it is hard to short circuit in template metaprogramming. A +// naive and incorrect solution to this problem would be something like +// `disable_if::value && typename T::ContainedType == C>`. +// This would disable all types that are not `ModuleHolder`s, because even +// though the `is_module_holder::value` may be `false` for such types the +// `T::ContainedType` access would be ill-formed and thus fail the whole +// expression by the rules of SFINAE. Instead we have to use template +// specialization to statically branch on the first condition +// (`is_module_holder`) and are only then allowed to query +// `T::ContainedType` in the branch for which the condition was true. + +// Base template. +template +struct is_module_holder_of_impl; + +// False branch. `T` is not a `ModuleHolder` and thus not a `ModuleHolder` with +// contained type `C`. +template +struct is_module_holder_of_impl : std::false_type {}; + +// True branch. `T` is a `ModuleHolder` and thus we can legit access its +// `ContainedType` and compare it against `C`. +template +struct is_module_holder_of_impl + : std::is_same {}; + +// Helper template. 
+template +struct is_module_holder_of : is_module_holder_of_impl< + detail::is_module_holder::value, + torch::decay_t, + torch::decay_t> {}; diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index 48c331e148686..ecdd36af23187 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -10,17 +10,8 @@ namespace torch { namespace detail { -/// This class exists only to do SFINAE on abstract types `T` that are really -/// `ModuleHolder`, because there's no good way to say that `T` is a -/// `ModuleHolder` over some unknown type `ModuleType`. With this, you can do -/// `enable_if_t>`. -struct ModuleHolderIndicator {}; - -template -using is_module_holder = std::is_base_of>; - -template -using disable_if_module_holder_t = disable_if_t::value>; +// Dump all the template metaprogramming in this file. +#include "pimpl-inl.h" } // namespace detail namespace nn { @@ -40,7 +31,9 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { using ContainedType = Contained; /// Default constructs the contained module if if has a default constructor, - /// else produces a static error. NOTE: This uses the behavior of template + /// else produces a static error. + /// + /// NOTE: This uses the behavior of template /// classes in C++ that constructors (or any methods) are only compiled when /// actually used. ModuleHolder() : impl_(default_construct()) { @@ -58,9 +51,16 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { /// Constructs the `ModuleHolder` with a contained module, forwarding all /// arguments to its constructor. - template - explicit ModuleHolder(Ts&&... ts) - : impl_(new Contained(std::forward(ts)...)) {} + template < + typename Head, + typename... Tail, + typename = torch::disable_if_t< + detail::is_module_holder_of::value && + (sizeof...(Tail) == 0)>> + explicit ModuleHolder(Head&& head, Tail&&... tail) + : impl_(new Contained( + std::forward(head), + std::forward(tail)...)) {} /// Constructs the `ModuleHolder` from a pointer to the contained type. /// Example: `Linear(std::make_shared(...))`. @@ -158,15 +158,10 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { /// Defines a class `Name` which inherits from `nn::ModuleHolder` to provide a /// wrapper over a `std::shared_ptr`. -#define TORCH_MODULE_IMPL(Name, Impl) \ - class Name : public torch::nn::ModuleHolder { /* NOLINT */ \ - public: \ - using torch::nn::ModuleHolder::ModuleHolder; \ - Name(const Name&) = default; /* NOLINT */ \ - Name(Name&&) = default; /* NOLINT */ \ - Name(Name& other) : Name(static_cast(other)) {} /* NOLINT */ \ - Name& operator=(const Name&) = default; /* NOLINT */ \ - Name& operator=(Name&&) = default; /* NOLINT */ \ +#define TORCH_MODULE_IMPL(Name, Impl) \ + class Name : public torch::nn::ModuleHolder { /* NOLINT */ \ + public: \ + using torch::nn::ModuleHolder::ModuleHolder; \ } /// Like `TORCH_MODULE_IMPL`, but defaults the `Impl` name to `Impl`. 
diff --git a/torch/csrc/api/include/torch/optim/serialize.h b/torch/csrc/api/include/torch/optim/serialize.h index 163ebbdcf098b..1c85fa74e0062 100644 --- a/torch/csrc/api/include/torch/optim/serialize.h +++ b/torch/csrc/api/include/torch/optim/serialize.h @@ -51,7 +51,7 @@ void serialize( BufferContainer& buffers) { torch::Tensor size_tensor; archive.read(key + "/size", size_tensor); - const size_t size = size_tensor.toCLong(); + const size_t size = size_tensor.item(); for (size_t index = 0; index < size; ++index) { buffers.emplace_back(); archive.read( diff --git a/torch/csrc/api/include/torch/python.h b/torch/csrc/api/include/torch/python.h new file mode 100644 index 0000000000000..ba1da4599f439 --- /dev/null +++ b/torch/csrc/api/include/torch/python.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace python { +namespace detail { +template +std::vector cursor_to_vector(const Cursor& cursor) { + std::vector vector; + vector.reserve(cursor.size()); + cursor.map( + std::back_inserter(vector), [](const Tensor& tensor) { return tensor; }); + return vector; +} + +template +std::unordered_map cursor_to_map(const Cursor& cursor) { + std::unordered_map map; + map.reserve(cursor.size()); + cursor.map_items( + std::inserter(map, map.end()), + [](const std::string& key, const Tensor& tensor) { + return std::make_pair(key, tensor); + }); + return map; +} +} // namespace detail + +/// Adds method bindings for a pybind11 `class_` that binds an `nn::Module` +/// subclass. +/// +/// Say you have a pybind11 class object created with `py::class_(m, +/// "Net")`. This function will add all the necessary `.def()` calls to bind the +/// `nn::Module` base class' methods, such as `train()`, `eval()` etc. into +/// Python. The exact list of supported methods and their Python signatures are: +/// - `train()` +/// - `eval()` +/// - `is_training() -> bool` +/// - `zero_grad()` +/// - `cuda()` +/// - `cpu()` +/// - `parameters() -> List` +/// - `named_parameters() -> Dict` +/// - `buffers() -> List` +/// - `named_buffers() -> Dict` +template +py::class_ add_module_bindings(py::class_ module) { + return module.def("train", [](M& module) { module.train(); }) + .def("eval", [](M& module) { module.eval(); }) + .def("clone", [](M& module) { return module.clone(); }) + .def_property_readonly( + "training", [](M& module) { return module.is_training(); }) + .def_property_readonly( + "training", [](M& module) { return module.is_training(); }) + .def("zero_grad", [](M& module) { module.zero_grad(); }) + .def("cuda", [](M& module) { module.to(torch::kCUDA); }) + .def("cpu", [](M& module) { module.to(torch::kCPU); }) + .def( + "parameters", + [](M& module) { + return detail::cursor_to_vector(module.parameters()); + }) + .def( + "named_parameters", + [](M& module) { return detail::cursor_to_map(module.parameters()); }) + .def( + "buffers", + [](M& module) { return detail::cursor_to_vector(module.buffers()); }) + .def("named_buffers", [](M& module) { + return detail::cursor_to_map(module.buffers()); + }); +} + +/// Creates a pybind11 class object for an `nn::Module` subclass type and adds +/// default bindings. +/// +/// After adding the default bindings, the class object is returned, such that +/// you can add more bindings. +/// +/// Example usage: +/// \rst +/// .. 
code-block:: +/// struct Net : torch::nn::Module { +/// Net(int in, int out) { } +/// torch::Tensor forward(torch::Tensor x) { return x; } +/// }; +/// +/// PYBIND11_MODULE(my_module, m) { +/// torch::python::bind_module(m, "Net") +/// .def(py::init()) +/// .def("forward", &Net::forward); +/// } +/// \endrst +template +py::class_ bind_module(py::module module, const char* name) { + return add_module_bindings(py::class_(module, name)); +} +} // namespace python +} // namespace torch diff --git a/torch/csrc/api/include/torch/torch.h b/torch/csrc/api/include/torch/torch.h index 9b6eae58d9c72..38bd5a571283f 100644 --- a/torch/csrc/api/include/torch/torch.h +++ b/torch/csrc/api/include/torch/torch.h @@ -1,8 +1,14 @@ #pragma once #include +#include #include #include #include #include #include + +#ifdef TORCH_API_INCLUDE_EXTENSION_H +#include +#warning "Including torch/torch.h for C++ extensions is deprecated. Please include torch/extension.h" +#endif // defined(TORCH_API_INCLUDE_EXTENSION_H) diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index 7f6104876bcf0..37c4b1dcaf425 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -31,7 +31,7 @@ void LBFGS::add_grad(const torch::Tensor& step_size, const Tensor& update) { Tensor& pd = autograd::Variable(parameter).data(); pd.add_( update.slice(0, offset, offset + numel, 1).view_as(pd), - step_size.toCFloat()); + step_size.item()); offset += numel; } } @@ -45,7 +45,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { Tensor flat_grad = gather_flat_grad(); Tensor abs_grad_sum = flat_grad.abs().sum(); - if (abs_grad_sum.toCFloat() <= options.tolerance_grad_) { + if (abs_grad_sum.item() <= options.tolerance_grad_) { return loss; } @@ -65,7 +65,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { Tensor s = d.mul(t); Tensor ys = y.dot(s); - if (ys.toCFloat() > 1e-10) { + if (ys.item() > 1e-10) { // updating memory if (old_dirs.size() == options.history_size_) { @@ -140,14 +140,15 @@ torch::Tensor LBFGS::step(LossClosure closure) { break; } else if (current_evals >= options.max_eval_) { break; - } else if (abs_grad_sum.toCFloat() <= options.tolerance_grad_) { + } else if (abs_grad_sum.item() <= options.tolerance_grad_) { break; - } else if (gtd.toCFloat() > -options.tolerance_grad_) { + } else if (gtd.item() > -options.tolerance_grad_) { break; - } else if (d.mul(t).abs_().sum().toCFloat() <= options.tolerance_change_) { + } else if ( + d.mul(t).abs_().sum().item() <= options.tolerance_change_) { break; } else if ( - std::abs(loss.toCFloat() - prev_loss.toCFloat()) < + std::abs(loss.item() - prev_loss.item()) < options.tolerance_change_) { break; } diff --git a/torch/csrc/api/src/optim/serialize.cpp b/torch/csrc/api/src/optim/serialize.cpp index fbda6af91f32c..24f9096c6ac36 100644 --- a/torch/csrc/api/src/optim/serialize.cpp +++ b/torch/csrc/api/src/optim/serialize.cpp @@ -31,7 +31,7 @@ void serialize( serialize(archive, key, tensors); steps.clear(); for (const auto& step : tensors) { - steps.push_back(step.toCLong()); + steps.push_back(step.item()); } } } // namespace detail diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index d0ecc017b42b5..1847bb65b08f8 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -407,7 +407,7 @@ auto Engine::evaluate_function(FunctionTask& task) -> void { for (int i = 0; i < num_outputs; ++i) { auto& output = outputs[i]; at::DeviceGuard guard(output); - if (output.defined() && 
output.ne(output).any().toCByte()) { + if (output.defined() && output.ne(output).any().item()) { std::stringstream ss; ss << "Function '" << fn.name() << "' returned nan values in its " << i << "th output."; throw std::runtime_error(ss.str()); diff --git a/torch/csrc/autograd/functions/tensor.cpp b/torch/csrc/autograd/functions/tensor.cpp index d5a94d49985bc..493a1aadd1755 100644 --- a/torch/csrc/autograd/functions/tensor.cpp +++ b/torch/csrc/autograd/functions/tensor.cpp @@ -60,7 +60,7 @@ auto CopySlices::apply(variable_list&& inputs) -> variable_list { throw std::runtime_error(ERR_BACKWARD_TWICE); } - auto result = grad.type().tensor(base.sizes(), base.strides()); + auto result = at::empty_strided(base.sizes(), base.strides(), grad.options()); result.copy_(grad); auto offset = view.storage_offset() - base.storage_offset(); diff --git a/torch/csrc/autograd/python_hook.cpp b/torch/csrc/autograd/python_hook.cpp index 3ceb1f4aa201c..af02e9e46997e 100644 --- a/torch/csrc/autograd/python_hook.cpp +++ b/torch/csrc/autograd/python_hook.cpp @@ -51,7 +51,7 @@ auto PyFunctionPreHook::operator()(const variable_list& values) -> variable_list } variable_list results(values); - results[value_idx] = ((THPVariable*)value.get())->cdata; + if (value != Py_None) results[value_idx] = ((THPVariable*)value.get())->cdata; return results; } diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 3ba7ff94bc1fd..4c6ac18453c06 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -46,7 +46,7 @@ static PyObject *THPVariable_pynew(PyTypeObject* type, PyObject *args, PyObject if (!data || data == Py_None) { // For legacy serialization code, create an empty tensor. This is also used // by nn.Parameter() with no arguments. 
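Two idioms from the C++ hunks just above are worth spelling out: allocation now goes through options-aware factories such as `at::empty_strided(base.sizes(), base.strides(), grad.options())` rather than through a `Type` object, and the engine's nan check relies on a value comparing unequal to itself. A small Python sketch of both, purely illustrative and assuming only a stock `torch` install:

    import torch

    base = torch.randn(4, 3)

    # Options-style allocation: make a fresh tensor matching base's dtype/device,
    # the Python analogue of at::empty(sizes, tensor.options()).
    out = torch.empty(base.size(), dtype=base.dtype, device=base.device)
    out.copy_(base)

    # NaN detection via self-inequality, as in output.ne(output).any() above:
    # a float is NaN exactly when it is not equal to itself.
    has_nan = (out != out).any().item()
    print(bool(has_nan))   # False here, since randn never produces NaN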
- auto var = torch::tensors::get_default_tensor_type().tensor(); + auto var = at::empty({0}, torch::tensors::get_default_tensor_type().options()); tensor = static_cast(var).data(); } else if (THPVariable_Check(data)) { tensor = ((THPVariable*)data)->cdata.data(); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index b50dddace66c5..abb588bee8fc5 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -173,7 +173,7 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis result = applySelect(result, dim, THPUtils_unpackLong(obj)); } else { result = result.unsqueeze(dim); - handle_var(boolToIndexingTensor(result, var.toCByte() != 0)); + handle_var(boolToIndexingTensor(result, var.item() != 0)); } } else { handle_var(var); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index bd2e475645975..9de77efeb79fa 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -599,6 +599,6 @@ inline Variable::Variable(c10::intrusive_ptr self) inline Variable::Impl* Variable::get() const { AT_CHECK(defined(), "Called Variable::get() on an undefined Variable"); - return static_cast(tensor_impl_.get()); + return static_cast(impl_.get()); } }} // namespace torch::autograd diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 716a1d30c3c9c..5531348ebdaf0 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -47,7 +47,7 @@ std::vector broadcast(const Tensor& tensor, IntList devices) { tensors.push_back(tensor); for (auto device : devices.slice(1)) { _device_guard.set_index(device); - tensors.push_back(type.tensor(tensor.sizes())); + tensors.push_back(at::empty(tensor.sizes(), type.options())); } nccl::broadcast(tensors); } else { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 251a6466ee3a4..009bf68ae3f6d 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -23,10 +23,20 @@ void wrapDim(int64_t & dim, const std::vector & sizes) { } bool isDifferentiable(Node * n) { + // TODO: scalar-tensor ops should be canonicalized static OperatorSet differentiable_ops = { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::add(Scalar other, Tensor self) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::sub(Scalar other, Tensor self) -> Tensor", "aten::mul(Tensor self, Tensor other) -> Tensor", + "aten::mul(Tensor self, Scalar other) -> Tensor", + "aten::mul(Scalar other, Tensor self) -> Tensor", + "aten::div(Scalar other, Tensor self) -> Tensor", + "aten::div(Tensor self, Tensor other) -> Tensor", + "aten::div(Tensor self, Scalar other) -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", "aten::tanh(Tensor self) -> Tensor", "aten::relu(Tensor self) -> Tensor", @@ -43,9 +53,39 @@ bool isDifferentiable(Node * n) { "aten::gt(Tensor self, Tensor other) -> Tensor", "aten::ge(Tensor self, Tensor other) -> Tensor", "aten::eq(Tensor self, Tensor other) -> Tensor", - "aten::ne(Tensor self, Tensor other) -> Tensor" + "aten::ne(Tensor self, Tensor other) -> Tensor", + "aten::abs(Tensor self) -> Tensor", + "aten::acos(Tensor self) -> Tensor", + "aten::asin(Tensor self) -> Tensor", + "aten::atan(Tensor self) -> Tensor", + "aten::ceil(Tensor self) -> 
Tensor", + "aten::cos(Tensor self) -> Tensor", + "aten::cosh(Tensor self) -> Tensor", + "aten::exp(Tensor self) -> Tensor", + "aten::expm1(Tensor self) -> Tensor", + "aten::floor(Tensor self) -> Tensor", + "aten::fmod(Tensor self, Scalar other) -> Tensor", + "aten::frac(Tensor self) -> Tensor", + "aten::log(Tensor self) -> Tensor", + "aten::log10(Tensor self) -> Tensor", + "aten::log1p(Tensor self) -> Tensor", + "aten::log2(Tensor self) -> Tensor", + "aten::reciprocal(Tensor self) -> Tensor", + "aten::remainder(Tensor self, Scalar other) -> Tensor", + "aten::round(Tensor self) -> Tensor", + "aten::rsqrt(Tensor self) -> Tensor", + "aten::sin(Tensor self) -> Tensor", + "aten::sinh(Tensor self) -> Tensor", + "aten::tan(Tensor self) -> Tensor", + "aten::trunc(Tensor self) -> Tensor", }; + // TODO: add support for the following fusible operators. + // They're a little tricky to implement; max/min require mutability for best perf + // "aten::atan2(Tensor self) -> Tensor", + // "aten::max(Tensor self) -> Tensor", + // "aten::min(Tensor self) -> Tensor" + if (n->kind() == prim::Constant || n->kind() == prim::AutogradAdd || n->kind() == prim::ConstantChunk) @@ -89,15 +129,42 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val if (node->matches("aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor")) { return {grads.at(0), grads.at(0) * node->namedInput(attr::alpha), nullptr}; + } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { + return {grads.at(0), nullptr, nullptr}; + + } else if (node->matches("aten::add(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, grads.at(0)}; + } else if (node->kind() == prim::AutogradAdd) { return {grads.at(0), grads.at(0)}; } else if (node->matches("aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor")) { return {grads.at(0), -grads.at(0) * node->namedInput(attr::alpha), nullptr}; + } else if (node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor")) { + return {grads.at(0), nullptr, nullptr}; + + } else if (node->matches("aten::sub(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, -grads.at(0)}; + } else if (node->matches("aten::mul(Tensor self, Tensor other) -> Tensor")) { return {grads.at(0) * inputs.at(1), grads.at(0) * inputs.at(0)}; + } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0) * inputs.at(1), nullptr}; + + } else if (node->matches("aten::mul(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, grads.at(0) * inputs.at(0)}; + + } else if (node->matches("aten::div(Tensor self, Tensor other) -> Tensor")) { + return {grads.at(0) / inputs.at(1), -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; + + } else if (node->matches("aten::div(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0) / inputs.at(1), nullptr}; + + } else if (node->matches("aten::div(Scalar other, Tensor self) -> Tensor")) { + return {nullptr, -grads.at(0) * inputs.at(0) / (inputs.at(1) * inputs.at(1))}; + } else if (node->matches("aten::sigmoid(Tensor self) -> Tensor")) { return {grads.at(0) * outputs.at(0) * (1 - outputs.at(0))}; @@ -130,6 +197,78 @@ static std::vector gradientForNode(Node* node, ArrayRef grad_val } else if (node->matches("aten::neg(Tensor self) -> Tensor")) { return {-grads.at(0)}; + } else if (node->matches("aten::abs(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).sign()}; + + } else if (node->matches("aten::acos(Tensor self) -> Tensor")) { + return 
{grads.at(0) * -((-inputs.at(0) * inputs.at(0) + at::Scalar(1)).rsqrt())}; + + } else if (node->matches("aten::asin(Tensor self) -> Tensor")) { + return {grads.at(0) * (-inputs.at(0) * inputs.at(0) + at::Scalar(1)).rsqrt()}; + + } else if (node->matches("aten::atan(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) * inputs.at(0) + at::Scalar(1))}; + + } else if (node->matches("aten::ceil(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + + } else if (node->matches("aten::cos(Tensor self) -> Tensor")) { + return {grads.at(0) * -inputs.at(0).sin()}; + + } else if (node->matches("aten::cosh(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).sinh()}; + + } else if (node->matches("aten::exp(Tensor self) -> Tensor")) { + return {grads.at(0) * outputs.at(0)}; + + } else if (node->matches("aten::expm1(Tensor self) -> Tensor")) { + return {grads.at(0) * (outputs.at(0) + at::Scalar(1))}; + + } else if (node->matches("aten::floor(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + + } else if (node->matches("aten::fmod(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0), nullptr}; + + } else if (node->matches("aten::frac(Tensor self) -> Tensor")) { + return {grads.at(0)}; + + } else if (node->matches("aten::log(Tensor self) -> Tensor")) { + return {grads.at(0) / inputs.at(0)}; + + } else if (node->matches("aten::log10(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) * 2.3025850929940456)}; + + } else if (node->matches("aten::log1p(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) + at::Scalar(1))}; + + } else if (node->matches("aten::log2(Tensor self) -> Tensor")) { + return {grads.at(0) / (inputs.at(0) * 0.6931471805599453)}; + + } else if (node->matches("aten::reciprocal(Tensor self) -> Tensor")) { + return {-grads.at(0) * outputs.at(0) * outputs.at(0)}; + + } else if (node->matches("aten::remainder(Tensor self, Scalar other) -> Tensor")) { + return {grads.at(0), nullptr}; + + } else if (node->matches("aten::round(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + + } else if (node->matches("aten::rsqrt(Tensor self) -> Tensor")) { + return {grads.at(0) * outputs.at(0).pow(3.) * -0.5}; + + } else if (node->matches("aten::sin(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).cos()}; + + } else if (node->matches("aten::sinh(Tensor self) -> Tensor")) { + return {grads.at(0) * inputs.at(0).cosh()}; + + } else if (node->matches("aten::tan(Tensor self) -> Tensor")) { + return {grads.at(0) * (1. 
+ outputs.at(0) * outputs.at(0))}; + + } else if (node->matches("aten::trunc(Tensor self) -> Tensor")) { + return {SymbolicVariable::zeros_like(grads.at(0))}; + } else if (node->kind() == prim::ConstantChunk) { return {SymbolicVariable::cat(grads, node->i(attr::dim))}; diff --git a/torch/csrc/jit/batched/BatchTensor.cpp b/torch/csrc/jit/batched/BatchTensor.cpp index a843280912437..564b4b7e4449b 100644 --- a/torch/csrc/jit/batched/BatchTensor.cpp +++ b/torch/csrc/jit/batched/BatchTensor.cpp @@ -14,14 +14,14 @@ BatchTensor::BatchTensor(at::Tensor data, at::Tensor mask, at::Tensor dims){ } BatchTensor::BatchTensor(at::Tensor data, int64_t batch_size){ - dims = data.type().toScalarType(at::kByte).tensor(data.dim()); + dims = at::empty(data.dim(), data.options().dtype(at::kByte)); dims.fill_(0); std::vector sizes(data.dim() + 1, -1); sizes[0] = batch_size; this->data = data.unsqueeze(0).expand(sizes); std::vector mask_sizes(data.dim() + 1, 1); mask_sizes[0] = batch_size; - mask = data.type().toScalarType(at::kByte).tensor(mask_sizes); + mask = at::empty(mask_sizes, data.options().dtype(at::kByte)); mask.fill_(1); } @@ -34,17 +34,17 @@ BatchTensor::BatchTensor(const std::vector datalist, at::Tensor dims for(auto x : datalist){ sizes[i] = std::max(sizes[i], x.size(i)); } - mask_sizes[i] = *dims[i - 1].toByteData() ? sizes[i] : 1; + mask_sizes[i] = *dims[i - 1].data() ? sizes[i] : 1; } - data = datalist[0].type().tensor(sizes); + data = at::empty(sizes, datalist[0].options()); data.fill_(0); - mask = datalist[0].type().toScalarType(at::kByte).tensor(mask_sizes); + mask = at::empty(mask_sizes, datalist[0].options().dtype(at::kByte)); mask.fill_(0); for(std::size_t i = 0; i < datalist.size(); i++){ auto data_item = data.narrow(0, i, 1); auto mask_item = mask.narrow(0, i, 1); for(int64_t j = 0; j < dims.size(0); j++){ - if(*dims[j].toByteData()){ + if(*dims[j].data()){ data_item = data_item.narrow(j + 1, 0, datalist[i].size(j + 1)); mask_item = mask_item.narrow(j + 1, 0, datalist[i].size(j + 1)); } @@ -62,12 +62,12 @@ std::vector BatchTensor::examples() { data = data.sum(d, /*keepdim=*/true); while(data.dim() >= 1) data = data[0]; - return *data.toLongData(); + return *data.data(); }; for(int64_t i = 0; i < data.size(0); i++){ auto data_tmp = data.narrow(0, i, 1); for(int64_t d = 0; d < dims.size(0); d++){ - if(*dims[d].toByteData()){ + if(*dims[d].data()){ data_tmp = data_tmp.narrow(d + 1, 0, mask_sum(mask[i], d)); } } diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 4cdb193d8434d..f1844d2bac665 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -16,6 +16,9 @@ Value* insertConstant( if(!ref.defined()) { throw constant_not_supported_error("undefined tensors cannot become constants"); } + if (ref.is_variable()) { + ref = autograd::Variable(ref).data(); + } n->output()->inferTypeFrom(ref); // note: before t_ because of std::move(ref) n->t_(attr::value, std::move(ref)); } else if(val.isInt()) { diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 1984f35fcc897..437d0f6c77997 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -681,7 +681,7 @@ void ModuleEncoder::EncodeTensor( // NB: This new tensor is created to support cuda tensors. // Storages can be mutated when converting tensors from cuda to cpu, // and we need a cpu tensor to copy data from. 
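Circling back to the symbolic derivative table added in autodiff.cpp above: the constants 2.3025850929940456 and 0.6931471805599453 are ln 10 and ln 2, i.e. d/dx log10(x) = 1 / (x * ln 10) and d/dx log2(x) = 1 / (x * ln 2). A short numeric cross-check against autograd (illustrative only, not part of the patch):

    import math
    import torch

    x = (torch.rand(5, dtype=torch.double) + 0.1).requires_grad_()

    torch.log10(x).sum().backward()
    expected = 1.0 / (x.detach() * math.log(10.0))   # math.log(10) == 2.302585...
    print(torch.allclose(x.grad, expected))          # True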
- t = tensor.type().tensor( + t = at::getType(tensor).tensor( tensor.storage(), /* storageOffset = */ 0, /* size = */ { static_cast(tensor.type().elementSizeInBytes() * tensor.storage().size()) }, diff --git a/torch/csrc/jit/fusers/common/fused_kernel.cpp b/torch/csrc/jit/fusers/common/fused_kernel.cpp index 54a3c57b83a75..6095bb1374847 100644 --- a/torch/csrc/jit/fusers/common/fused_kernel.cpp +++ b/torch/csrc/jit/fusers/common/fused_kernel.cpp @@ -221,7 +221,7 @@ void FusedKernel::launch( outputs.clear(); outputs.reserve(outputDescriptors().size()); for(auto & od : outputDescriptors()) { - outputs.push_back(ref_type.toScalarType(od.scalar_type).tensor()); + outputs.push_back(at::empty({0}, ref_type.options().dtype(od.scalar_type))); } launch_with_tensors(inputs, outputs); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 492faade8de61..d071c46472155 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -361,6 +361,14 @@ struct GraphExecutorImpl { return state; } + // This function should be used only for testing purposes + void debugDisableAutodiffSubgraphInlining() { + // Allow single-node autodiff subgraphs + autodiffSubgraphNodeThreshold = 1; + // Don't inline autodiff subgraphs into autograd functions + autodiffSubgraphInlineThreshold = 1; + } + private: friend struct GraphExecutor; @@ -416,14 +424,14 @@ struct GraphExecutorImpl { // Phase 5. Apply non-differentiable optimizations to the graphs we've found // (or the whole grpah if we know we won't need its derivative). if (needsGradient(opt_graph)) { - auto diff_nodes = CreateAutodiffSubgraphs(*opt_graph); + auto diff_nodes = CreateAutodiffSubgraphs(*opt_graph, autodiffSubgraphNodeThreshold); for (Node * dnode : diff_nodes) { auto diff_graph = std::move(dnode->g(attr::Subgraph)); Gradient gradient = differentiate(diff_graph); runNondiffOptimization(gradient.f); packGradient(gradient, dnode); } - InlineAutodiffSubgraphs(opt_graph); + InlineAutodiffSubgraphs(opt_graph, autodiffSubgraphInlineThreshold); } else { runNondiffOptimization(opt_graph); } @@ -523,6 +531,10 @@ struct GraphExecutorImpl { // GraphExecutors can be accessed from multiple threads, so this thread needs to be // held every time we access the fallback or plan_cache. 
std::mutex compile_mutex; + + // Some tunable parameters + size_t autodiffSubgraphNodeThreshold = 2; + size_t autodiffSubgraphInlineThreshold = 5; }; GraphExecutor::GraphExecutor(std::shared_ptr graph, bool optimize) @@ -544,6 +556,10 @@ GraphExecutorState GraphExecutor::getDebugState() { return pImpl->getDebugState(); } +void GraphExecutor::debugDisableAutodiffSubgraphInlining() { + return pImpl->debugDisableAutodiffSubgraphInlining(); +} + void runRequiredPasses(const std::shared_ptr& g) { specializeUndef(*g); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 7e644273a5b07..08688a8c8cab3 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -36,6 +36,7 @@ struct TORCH_API GraphExecutor { std::shared_ptr graph() const; std::shared_ptr graphFor(const Stack& inputs) const; GraphExecutorState getDebugState(); + void debugDisableAutodiffSubgraphInlining(); private: std::shared_ptr pImpl; }; diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 751035a00c0ba..98a7b01041932 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -14,6 +14,7 @@ #include "torch/csrc/jit/passes/erase_number_types.h" #include "torch/csrc/jit/passes/onnx/prepare_division_for_onnx.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" +#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" #include "torch/csrc/jit/passes/peephole.h" #include "torch/csrc/jit/passes/canonicalize.h" #include "torch/csrc/jit/passes/onnx/peephole.h" @@ -106,6 +107,9 @@ void initJITBindings(PyObject *module) { return ConstantPropagation(g); }) .def("_jit_pass_erase_shape_information", EraseShapeInformation) + .def("_jit_pass_create_autodiff_subgraphs", [](Graph& graph) { + CreateAutodiffSubgraphs(graph); + }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will diff --git a/torch/csrc/jit/interpreter.h b/torch/csrc/jit/interpreter.h index 151a980d76a11..d28558d4d15b4 100644 --- a/torch/csrc/jit/interpreter.h +++ b/torch/csrc/jit/interpreter.h @@ -6,7 +6,7 @@ #include "torch/csrc/WindowsTorchApiMacro.h" namespace at { - struct Tensor; + class Tensor; } namespace torch { namespace jit { diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp index 3554c22ddc70e..d1d73a36ea834 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp @@ -1,8 +1,11 @@ -#include +#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" + #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/autodiff.h" #include "torch/csrc/jit/assertions.h" +#include + namespace torch { namespace jit { struct Graph; @@ -30,6 +33,11 @@ Node* mergeNodes(Block * block, Symbol group_node_kind, ArrayRef nodes) { if(value_map.count(v) > 0) { return value_map[v]; } + if (auto value = toIValue(v)) { + Value * nv = new_graph->insertConstant(*value); + value_map[v] = nv; + return nv; + } Value * nv = new_graph->addInput()->setType(v->type()); group_node->addInput(v); value_map[v] = nv; @@ -69,8 +77,6 @@ Node* mergeNodes(Block * block, Symbol group_node_kind, ArrayRef nodes) { return group_node; } -} - void CreateAutodiffSubgraphs(Block * block, size_t threshold, std::vector& diff_graphs) { // This implementation is not optimal, but it is simple. 
// It just scans through the list in order looking for runs of @@ -90,8 +96,12 @@ void CreateAutodiffSubgraphs(Block * block, size_t threshold, std::vector for(Node * node : block->nodes()) { // Note: nodes() iterator stays valid since it is // always pointing _after_ the nodes that mergeNodes // mutates. - if(isDifferentiable(node)) { - groupable.push_back(node); + if (isDifferentiable(node)) { + // Constants are generally cheap to clone, so it's better to replicate them, + // instead of moving them out from the original graph. + if (node->kind() != prim::Constant) { + groupable.push_back(node); + } } else { if(groupable.size() >= threshold) { diff_graphs.push_back(mergeNodes(block, prim::DifferentiableGraph, groupable)); @@ -107,11 +117,12 @@ void CreateAutodiffSubgraphs(Block * block, size_t threshold, std::vector } } +} // anonymous namespace + std::vector CreateAutodiffSubgraphs(Graph & graph, size_t threshold) { std::vector diff_nodes; CreateAutodiffSubgraphs(graph.block(), threshold, diff_nodes); return diff_nodes; } - }} diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.h b/torch/csrc/jit/passes/create_autodiff_subgraphs.h index 44a6683dc4ce3..1908b03e2568f 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.h +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.h @@ -1,10 +1,12 @@ #pragma once + +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + #include namespace torch { namespace jit { -struct Graph; - // insert GraphExecutor nodes that group together // subgraphs that are differentiable by the jit's autodiff passes // threshold - minimum number of nodes that will appear in a block diff --git a/torch/csrc/jit/passes/peephole.cpp b/torch/csrc/jit/passes/peephole.cpp index ab4a75375081a..176166218fc01 100644 --- a/torch/csrc/jit/passes/peephole.cpp +++ b/torch/csrc/jit/passes/peephole.cpp @@ -19,7 +19,7 @@ void PeepholeOptimize(Block * block) { auto* node = *it; for (Block * sub_block : node->blocks()) { - PeepholeOptimize(sub_block); + PeepholeOptimize(sub_block); } // XXX: remember that if you want to simplify an expression by combining multiple nodes @@ -41,8 +41,8 @@ void PeepholeOptimize(Block * block) { } } else if (node->matches("aten::type_as(Tensor self, Tensor other) -> Tensor")) { // x.type_as(y) == x iff x.type() == y.type() - auto self_type = node->input(0)->type()->cast(); - auto other_type = node->input(1)->type()->cast(); + auto self_type = node->input(0)->type()->cast(); + auto other_type = node->input(1)->type()->cast(); if (self_type && other_type && self_type->scalarType() == other_type->scalarType() && self_type->device() == other_type->device()) { @@ -100,6 +100,20 @@ void PeepholeOptimize(Block * block) { } } } + // TODO: this doesn't work with Scalar-Tensor ops! 
We should canonicalize those + } else if (node->matches("aten::mul(Tensor self, Scalar other) -> Tensor", /*with_const=*/attr::other) || + node->matches("aten::div(Tensor self, Scalar other) -> Tensor", /*with_const=*/attr::other)) { + // x * 1 == x / 1 == x + if (node->get(attr::other)->toDouble() == 1) { + node->output()->replaceAllUsesWith(node->input(0)); + } + } else if (node->matches("aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*with_const=*/{attr::alpha, attr::other}) || + node->matches("aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", /*with_const=*/{attr::alpha, attr::other})) { + // x + 0 == x - 0 == x + if (node->get(attr::alpha)->toDouble() == 1 && + node->get(attr::other)->toDouble() == 0) { + node->output()->replaceAllUsesWith(node->input(0)); + } } else if(node->kind() == prim::TensorToNum || node->kind() == prim::ImplicitTensorToNum) { Node* input_node = node->input()->node(); if (input_node->kind() == prim::NumToTensor) { diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 53e16cc0a09f9..5aa053f626faa 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -45,7 +45,7 @@ std::ostream& printPyObject(std::ostream & out, const THPObjectPtr& obj) { auto pytuple = pyobj.cast(); out << "("; size_t i = 0; - for (auto& o : pytuple) { + for (const auto& o : pytuple) { if (i > 0) { out << ", "; } diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 1f8618121f1e2..71168cd3ee3d4 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -70,7 +70,7 @@ RegisterOperators reg({ at::Tensor a; pop(stack, a); at::DeviceGuard guard(a); - push(stack, a.toCLong()); + push(stack, a.item()); return 0; }; } else { @@ -78,7 +78,7 @@ RegisterOperators reg({ at::Tensor a; pop(stack, a); at::DeviceGuard guard(a); - push(stack, a.toCDouble()); + push(stack, a.item()); return 0; }; } @@ -92,7 +92,7 @@ RegisterOperators reg({ pop(stack, a); checkImplicitTensorToNum(a, /*to int*/true); at::DeviceGuard guard(a); - push(stack, a.toCLong()); + push(stack, a.item()); return 0; }; } else { @@ -101,7 +101,7 @@ RegisterOperators reg({ pop(stack, a); checkImplicitTensorToNum(a, /*to int*/false); at::DeviceGuard guard(a); - push(stack, a.toCDouble()); + push(stack, a.item()); return 0; }; } @@ -727,7 +727,7 @@ RegisterOperators reg2({ pop(stack, t); std::vector elems; for(int i = 0; i < t.size(0); i++){ - elems.push_back(*t[i].toIntData()); + elems.push_back(*t[i].data()); } push(stack, jit::IntList::create(elems)); return 0; diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index c09caf4c3702f..f0dfda81cc092 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -487,6 +487,12 @@ void initJitScriptBindings(PyObject* module) { } throw std::runtime_error("Attempted to call get_debug_state on a Module without a compiled forward()"); }) + .def("debug_disable_autodiff_subgraph_inlining", [](Module& self) { + if (self.find_method("forward")) { + Method & m = self.get_method("forward"); + m.debugDisableAutodiffSubgraphInlining(); + } + }) .def("forward", [](Module& self, py::args args, py::kwargs kwargs) { // We implement this in C++ to avoid incurring the pybind11 dispatch // overhead twice: once to call into the method lookup for "forward" @@ -515,6 +521,7 @@ void initJitScriptBindings(PyObject* module) { auto schema = extractSchemaFromDef(def, is_method); self.setSchema(schema); }) + 
.def("debug_disable_autodiff_subgraph_inlining", &Method::debugDisableAutodiffSubgraphInlining) .def("pretty_print_schema", &Method::pretty_print_schema); m.def("_jit_script_compile", [](const Def &def, ResolutionCallback rcb) { diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index caf084d074ba9..50ae9f48fb3c9 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -165,6 +165,10 @@ struct Method { return get_executor().getDebugState(); } + void debugDisableAutodiffSubgraphInlining() { + return get_executor().debugDisableAutodiffSubgraphInlining(); + } + bool is_optimized() { return optimize; } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index 3e38b4323da32..daac5d48d1d89 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -56,6 +56,9 @@ struct SymbolicVariable { SymbolicVariable operator*(const SymbolicVariable rhs) const { return create(aten::mul, {*this, rhs})[0].typeLike(*this); } + SymbolicVariable operator/(const SymbolicVariable rhs) const { + return create(aten::div, {*this, rhs})[0].typeLike(*this); + } SymbolicVariable operator*(at::Scalar rhs) const { if (isConstInt(rhs, 1)) return *this; @@ -170,6 +173,30 @@ struct SymbolicVariable { Node * unpack = g->insertNode(g->create(prim::ListUnpack, {output_list}, inputs.size())); return fmap(unpack->outputs()); } + static SymbolicVariable zeros_like(const SymbolicVariable input) { + return create(t("zeros_like"), {input})[0]; + } + SymbolicVariable cos() const { + return create(t("cos"), {*this})[0]; + } + SymbolicVariable cosh() const { + return create(t("cosh"), {*this})[0]; + } + SymbolicVariable pow(at::Scalar other) const { + return create(t("pow"), {*this, insertConstant(other)})[0]; + } + SymbolicVariable rsqrt() const { + return create(t("rsqrt"), {*this})[0]; + } + SymbolicVariable sign() const { + return create(t("sign"), {*this})[0]; + } + SymbolicVariable sin() const { + return create(t("sin"), {*this})[0]; + } + SymbolicVariable sinh() const { + return create(t("sinh"), {*this})[0]; + } SymbolicVariable sum() const { return create(t("sum"), {*this})[0]; } diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index c853db1b1632d..9942437c9eb28 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -148,7 +148,7 @@ static void fusionTests() { auto outputs = debugLaunchGraph(graph, 0, {a,b}); CATCH_REQUIRE(outputs.size() == 1); auto o2 = a*b; - float max_diff = (o2 - outputs[0]).abs().max().toCDouble(); + float max_diff = (o2 - outputs[0]).abs().max().item(); //std::cout << "max diff: " << max_diff << "\n"; CATCH_REQUIRE(max_diff == 0); }; @@ -202,7 +202,7 @@ static void fusionTests() { auto outputs = debugLaunchGraph(graph, 0, inputs); CATCH_REQUIRE(outputs.size() == graph.outputs().size()); CATCH_REQUIRE(out0.is_same_size(outputs.front())); - float max_diff = (outputs.front() - out0).abs().max().toCDouble(); + float max_diff = (outputs.front() - out0).abs().max().item(); CATCH_REQUIRE(max_diff < 1e-6); }; @@ -236,9 +236,9 @@ static void fusionTests() { auto outputs = debugLaunchGraph(graph, 0, {a,b}); CATCH_REQUIRE(outputs.size() == 2); - float max_diff = (o_r - outputs[0]).abs().max().toCDouble(); + float max_diff = (o_r - outputs[0]).abs().max().item(); CATCH_REQUIRE(max_diff == 0); - float max_diff2 = (o2_r - outputs[1]).abs().max().toCDouble(); + float max_diff2 = (o2_r - outputs[1]).abs().max().item(); CATCH_REQUIRE(max_diff2 == 0); }; 
testConcat(0); @@ -325,16 +325,16 @@ at::Tensor t_def(at::Tensor x) { bool checkRtol(const at::Tensor& diff, const std::vector inputs) { double maxValue = 0.0; for (auto& tensor : inputs) { - maxValue = fmax(tensor.abs().max().toCFloat(), maxValue); + maxValue = fmax(tensor.abs().max().item(), maxValue); } - return diff.abs().max().toCFloat() < 2e-6 * maxValue; + return diff.abs().max().item() < 2e-6 * maxValue; } bool almostEqual(const at::Tensor & a, const at::Tensor & b) { return checkRtol(a - b,{a, b}); } bool exactlyEqual(const at::Tensor & a, const at::Tensor & b) { - return (a - b).abs().max().toCFloat() == 0.f; + return (a - b).abs().max().item() == 0.f; } std::pair @@ -533,7 +533,7 @@ struct ADTestSpec { variable_list get_grad_outputs(const variable_list& vars) { return fmap(vars, [](const Variable& v) -> Variable { - return v.type().tensor(v.sizes()).normal_(); + return at::randn(v.sizes(), v.options()); }); } @@ -873,7 +873,7 @@ void testControlFlow() { }; auto L = [](int64_t l) { return IValue(autograd::make_variable(scalar_to_tensor(at::Scalar(l)))); }; - auto V = [](IValue t) { return std::move(t).toTensor().toCLong(); }; + auto V = [](IValue t) { return std::move(t).toTensor().item(); }; auto run_binary = [&](const std::string & name, int64_t a, int64_t b) { return V(run(name, {L(a), L(b)})[0]); }; diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index 4a40cf243f3a6..1b85b1810b660 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -3,7 +3,6 @@ #include #include -#include "torch/csrc/torch.h" #include "torch/csrc/Dtype.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" @@ -17,6 +16,7 @@ #include "torch/csrc/utils/python_strings.h" #include "torch/csrc/utils/tensor_new.h" #include "torch/csrc/utils/tensor_types.h" +#include "torch/csrc/variable_tensor_functions.h" #include diff --git a/torch/csrc/tensor/python_tensor.h b/torch/csrc/tensor/python_tensor.h index 64ebbef786052..a8c282dd1e96a 100644 --- a/torch/csrc/tensor/python_tensor.h +++ b/torch/csrc/tensor/python_tensor.h @@ -5,7 +5,7 @@ namespace at { struct Type; struct Device; -struct Tensor; +class Tensor; } // namespace at namespace torch { namespace tensors { diff --git a/torch/csrc/torch.cpp b/torch/csrc/torch.cpp index d3f79cd49dbdc..656cae7f7e154 100644 --- a/torch/csrc/torch.cpp +++ b/torch/csrc/torch.cpp @@ -3,15 +3,15 @@ #include namespace torch { -at::Type& getVariableType(at::Backend backend, at::ScalarType type) { +at::TypeExtendedInterface& getVariableType(at::Backend backend, at::ScalarType type) { return *autograd::VariableType::getVariableTypeFromBaseType(at::getNonVariableType(backend, type)); } -at::Type& CPU(at::ScalarType type) { +at::TypeExtendedInterface& CPU(at::ScalarType type) { return torch::getVariableType(at::Backend::CPU, type); } -at::Type& CUDA(at::ScalarType type) { +at::TypeExtendedInterface& CUDA(at::ScalarType type) { return torch::getVariableType(at::Backend::CUDA, type); } diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h index 2188681906d53..85fc443bb40b7 100644 --- a/torch/csrc/utils/pybind.h +++ b/torch/csrc/utils/pybind.h @@ -72,7 +72,7 @@ template<> struct type_caster { for (int idx = 0; idx < size; idx++) { PyObject* obj = tuple ? 
PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx); if (THPVariable_Check(obj)) { - v_value[idx] = THPVariable_Unpack(obj).toCLong(); + v_value[idx] = THPVariable_Unpack(obj).item(); } else if (PyLong_Check(obj)) { // use THPUtils_unpackLong after it is safe to include python_numbers.h v_value[idx] = THPUtils_unpackLong(obj); diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 9ff25d2d4e513..d4c15fd9482f0 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -300,7 +300,7 @@ inline std::vector PythonArgs::intlistWithDefault(int i, std::vector(); continue; } else { res[idx] = THPUtils_unpackIndex(obj); diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 73b4adbf45a45..4c6a2855ea26c 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -280,12 +280,12 @@ Tensor legacy_sparse_tensor_ctor(const Type& type, PyObject* args, PyObject* kwa auto deviceOptional = r.deviceOptional(2); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), type.options()); } else if (r.idx == 3) { auto deviceOptional = r.deviceOptional(3); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2), type.options()); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); auto deviceOptional = r.deviceOptional(1); @@ -314,7 +314,7 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar auto deviceOptional = r.deviceOptional(0); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.tensor(); + return at::empty({0}, type.options()); } else if (r.idx == 1) { auto cdata = reinterpret_cast(r.toInt64(0)); return type.unsafeTensorFromTH(cdata, true); @@ -324,14 +324,14 @@ Tensor legacy_sparse_tensor_new(const Type& type, PyObject* args, PyObject* kwar auto deviceOptional = r.deviceOptional(2); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), type.options()); } else if (r.idx == 3) { // Note: this signature doesn't have a dtype, even though it has a device; it probably shouldn't // have a device (we should infer it). 
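(Editor's note, not part of the patch: the tensor_new.cpp hunks follow the same factory migration as the earlier files — Type-bound constructors such as type.tensor() and type.sparse_coo_tensor(...) become free factory functions taking a TensorOptions. A hedged sketch of that pattern; the TensorOptions below merely stands in for type.options() and the shapes are made up for the example:)

```cpp
#include <ATen/ATen.h>

int main() {
  // Stand-in for type.options(); in the patch the options come from an at::Type.
  at::TensorOptions opts = at::TensorOptions().dtype(at::kFloat);

  at::Tensor empty = at::empty({0}, opts);      // was: type.tensor()
  at::Tensor sized = at::empty({2, 3}, opts);   // was: type.tensor(sizes)

  // was: type.sparse_coo_tensor(indices, values, sizes)
  at::Tensor indices = at::zeros({1, 2}, at::kLong);  // 1 sparse dim, 2 entries (both at index 0)
  at::Tensor values  = at::ones({2}, opts);
  at::Tensor sparse  = at::sparse_coo_tensor(indices, values, {4});

  return (empty.numel() == 0 && sized.size(0) == 2 && sparse.is_sparse()) ? 0 : 1;
}
```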
auto deviceOptional = r.deviceOptional(3); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2)); + return at::sparse_coo_tensor(r.tensor(0), r.tensor(1), r.intlist(2), type.options()); } else if (r.idx == 4) { PyObject* arg = r.pyobject(0); auto deviceOptional = r.deviceOptional(1); @@ -374,7 +374,7 @@ Tensor legacy_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs) { auto deviceOptional = r.deviceOptional(0); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.tensor(); + return at::empty({0}, type.options()); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); } else if (r.idx == 2) { @@ -420,7 +420,7 @@ Tensor legacy_tensor_new(const Type& type, PyObject* args, PyObject* kwargs) { auto deviceOptional = r.deviceOptional(0); check_legacy_ctor_device(type, deviceOptional); at::DeviceGuard device_guard(deviceOptional); - return type.tensor(); + return at::empty({0}, type.options()); } else if (r.idx == 1) { return new_with_storage(type, r.storage(0)); } else if (r.idx == 2) { @@ -472,7 +472,7 @@ Tensor sparse_coo_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs const auto& index_type = values.type().toScalarType(kLong); Tensor indices = internal_new_from_data(index_type, r.deviceOptional(3), r.pyobject(0), false, true, false); const auto& sparse_type_to_use = values.type().toBackend(values.type().is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU); - return sparse_type_to_use.sparse_coo_tensor(indices, values).set_requires_grad(r.toBool(4)); + return at::sparse_coo_tensor(indices, values, sparse_type_to_use.options()).set_requires_grad(r.toBool(4)); } else if (r.idx == 1) { bool type_inference = r.isNone(3); const auto& sparse_type = typeWithDefault(r, 3, 4, default_sparse_type); @@ -482,11 +482,11 @@ Tensor sparse_coo_tensor_ctor(const Type& type, PyObject* args, PyObject* kwargs const auto& index_type = values.type().toScalarType(kLong); Tensor indices = internal_new_from_data(index_type, r.deviceOptional(4), r.pyobject(0), false, true, false); const auto& sparse_type_to_use = values.type().toBackend(values.type().is_cuda() ? Backend::SparseCUDA : Backend::SparseCPU); - return sparse_type_to_use.sparse_coo_tensor(indices, values, r.intlist(2)).set_requires_grad(r.toBool(5)); + return at::sparse_coo_tensor(indices, values, r.intlist(2), sparse_type_to_use.options()).set_requires_grad(r.toBool(5)); } else if (r.idx == 2) { const auto& sparse_type_to_use = typeWithDefault(r, 1, 2, default_sparse_type); at::DeviceGuard device_guard(r.device(2)); - return sparse_type_to_use.sparse_coo_tensor(r.intlist(0)).set_requires_grad(r.toBool(3)); + return at::sparse_coo_tensor(r.intlist(0), sparse_type_to_use.options()).set_requires_grad(r.toBool(3)); } throw std::runtime_error("sparse_coo_tensor(): invalid arguments"); } diff --git a/torch/csrc/variable_tensor_functions.h b/torch/csrc/variable_tensor_functions.h index 692fe60aaeeab..e18794a970fe9 100644 --- a/torch/csrc/variable_tensor_functions.h +++ b/torch/csrc/variable_tensor_functions.h @@ -13,20 +13,20 @@ namespace torch { // when we create new tensors. We also provide a few accessors like requires_grad // that make it easier to get to varible information when we have a at::Tensor -/// Returns a `Type` object for the given backend (e.g. `at::kCPU`) and +/// Returns a `TypeExtendedInterface` object for the given backend (e.g. 
`at::kCPU`) and /// `ScalarType` (e.g. `at::kDouble`). /// TODO: Eliminate this function as much as possible -THP_CLASS at::Type& getVariableType(at::Backend backend, at::ScalarType type); +THP_CLASS at::TypeExtendedInterface& getVariableType(at::Backend backend, at::ScalarType type); -/// Returns a `Type` object for the CPU backend and the given `ScalarType` +/// Returns a `TypeExtendedInterface` object for the CPU backend and the given `ScalarType` /// (e.g. `at::kDouble`). Equivalent to `getVariableType(kCPU, type)`. /// TODO: Eliminate this function as much as possible -THP_CLASS at::Type& CPU(at::ScalarType type); +THP_CLASS at::TypeExtendedInterface& CPU(at::ScalarType type); -/// Returns a `Type` object for the CUDA backend and the given `ScalarType` +/// Returns a `TypeExtendedInterface` object for the CUDA backend and the given `ScalarType` /// (e.g. `at::kDouble`). Equivalent to `getVariableType(kCUDA, type)`. /// TODO: Eliminate this function as much as possible -THP_CLASS at::Type& CUDA(at::ScalarType type); +THP_CLASS at::TypeExtendedInterface& CUDA(at::ScalarType type); /// Sets the `requires_grad` property of the given `Tensor`. THP_CLASS void set_requires_grad(at::Tensor& tensor, bool requires_grad) noexcept; diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 0568e4261f448..0da3f31b22b13 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -9,19 +9,19 @@ from . import ProcessGroupGloo -_MPI_AVAILBLE = True -_NCCL_AVAILBLE = True +_MPI_AVAILABLE = True +_NCCL_AVAILABLE = True try: from. import ProcessGroupMPI except ImportError: - _MPI_AVAILBLE = False + _MPI_AVAILABLE = False try: from. import ProcessGroupNCCL except ImportError: - _NCCL_AVAILBLE = False + _NCCL_AVAILABLE = False class DistBackend(object): @@ -166,7 +166,7 @@ def is_mpi_available(): Checks if MPI is available """ - return _MPI_AVAILBLE + return _MPI_AVAILABLE def is_nccl_available(): @@ -174,7 +174,7 @@ def is_nccl_available(): Checks if NCCL is available """ - return _NCCL_AVAILBLE + return _NCCL_AVAILABLE def is_initialized(): diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index b489c8754aa44..7b9deaa8c1a8a 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -33,7 +33,8 @@ def __init__(self, loc, scale, validate_args=None): if isinstance(loc, Number) and isinstance(scale, Number): base_dist = Uniform(finfo.tiny, 1 - finfo.eps) else: - base_dist = Uniform(self.loc.new(self.loc.size()).fill_(finfo.tiny), 1 - finfo.eps) + base_dist = Uniform(torch.full_like(self.loc, finfo.tiny), + torch.full_like(self.loc, 1 - finfo.eps)) transforms = [ExpTransform().inv, AffineTransform(loc=0, scale=-torch.ones_like(self.scale)), ExpTransform().inv, AffineTransform(loc=loc, scale=-self.scale)] super(Gumbel, self).__init__(base_dist, transforms, validate_args=validate_args) diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py index a90cceefa5d6c..00a52164f9780 100644 --- a/torch/distributions/transforms.py +++ b/torch/distributions/transforms.py @@ -534,8 +534,8 @@ def _inverse_on_event(self, y): def _call(self, x): flat_x = x.contiguous().view((-1,) + x.shape[-2:]) - return torch.stack([self._call_on_event(z) for z in flat_x]).view(x.shape) + return torch.stack([self._call_on_event(flat_x[i]) for i in range(flat_x.size(0))]).view(x.shape) def _inverse(self, y): flat_y = y.contiguous().view((-1,) + y.shape[-2:]) - return 
torch.stack([self._inverse_on_event(z) for z in flat_y]).view(y.shape) + return torch.stack([self._inverse_on_event(flat_y[i]) for i in range(flat_y.size(0))]).view(y.shape) diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py index 8b5afee400b78..97a50fdd6e4af 100644 --- a/torch/distributions/weibull.py +++ b/torch/distributions/weibull.py @@ -27,7 +27,7 @@ class Weibull(TransformedDistribution): def __init__(self, scale, concentration, validate_args=None): self.scale, self.concentration = broadcast_all(scale, concentration) self.concentration_reciprocal = self.concentration.reciprocal() - base_dist = Exponential(self.scale.new(self.scale.size()).fill_(1.0)) + base_dist = Exponential(torch.ones_like(self.scale)) transforms = [PowerTransform(exponent=self.concentration_reciprocal), AffineTransform(loc=0, scale=self.scale)] super(Weibull, self).__init__(base_dist, diff --git a/torch/csrc/torch.h b/torch/extension.h similarity index 79% rename from torch/csrc/torch.h rename to torch/extension.h index 5761b8ef57f64..828aefd572ae7 100644 --- a/torch/csrc/torch.h +++ b/torch/extension.h @@ -2,6 +2,5 @@ #include -#include #include #include diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp index b23157581bdfc..e551da81d1356 100644 --- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp +++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp @@ -153,7 +153,7 @@ at::Tensor DataChannelMPI::_newLikeFlat(std::vector& tensors) const at::DeviceGuard gpu_guard(t.is_cuda() ? t.get_device() : -1); std::vector sizes { static_cast(tensors.size()) }; // sizes = [output.size()] + input.sizes() sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end()); - return t.type().tensor(sizes); + return at::empty(sizes, t.options()); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index e56d996a36ba3..a521c36eacf88 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -518,7 +518,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { } deviceGuard.set_index(-1); #endif - entry->src[i] = key.type->tensor(srcSizes[i]); + entry->src[i] = at::empty(srcSizes[i], key.type->options()); } #ifdef USE_CUDA diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index 63846b443ea07..033d5d24cb26d 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -98,7 +98,7 @@ bool ProcessGroupMPI::WorkMPI::isCompleted() { } bool ProcessGroupMPI::WorkMPI::isSuccess() const { - return !workException_; + return !exception_; } void ProcessGroupMPI::WorkMPI::synchronize() {} @@ -124,14 +124,14 @@ void ProcessGroupMPI::WorkMPI::finishWithException( { std::unique_lock lock(workMutex_); completed_ = true; - workException_ = caughtWorkException; + exception_ = caughtWorkException; } workCV_.notify_all(); } const std::exception& ProcessGroupMPI::WorkMPI::exception() const { try { - std::rethrow_exception(workException_); + std::rethrow_exception(exception_); } catch (const std::exception& e) { return e; } @@ -169,6 +169,11 @@ bool ProcessGroupMPI::AsyncWork::isCompleted() { *srcRank_ = status_.MPI_SOURCE; } + // Populate exception if request was not successful + if (status_.MPI_ERROR != MPI_SUCCESS) { + populateException(); + } + return true; } @@ -194,19 +199,30 @@ bool ProcessGroupMPI::AsyncWork::wait() { *srcRank_ = status_.MPI_SOURCE; } - return status_.MPI_ERROR == MPI_SUCCESS; + auto ok = 
(status_.MPI_ERROR == MPI_SUCCESS); + + // Populate exception if request was not successful + if (!ok) { + populateException(); + } + + return ok; } const std::exception& ProcessGroupMPI::AsyncWork::exception() const { - if (request_ != MPI_REQUEST_NULL) { - throw std::runtime_error( - "Invalid call to AsyncWork::exception before work has completed"); + try { + std::rethrow_exception(exception_); + } catch (const std::exception& e) { + return e; } +} +void ProcessGroupMPI::AsyncWork::populateException() { std::array buf; int len = buf.size(); MPI_CHECK(MPI_Error_string(status_.MPI_ERROR, buf.data(), &len)); - return std::runtime_error(std::string(buf.data(), len)); + exception_ = + std::make_exception_ptr(std::runtime_error(std::string(buf.data(), len))); } // Static global states diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index 5bd2b303c1a4e..8d3018be90325 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -101,8 +101,7 @@ class ProcessGroupMPI : public ProcessGroup { std::mutex workMutex_; std::condition_variable workCV_; std::atomic completed_; - - std::exception_ptr workException_; + std::exception_ptr exception_; friend class ProcessGroupMPI; }; @@ -123,10 +122,13 @@ class ProcessGroupMPI : public ProcessGroup { const std::exception& exception() const override; protected: + void populateException(); + at::Tensor tensor_; MPI_Request request_; int* const srcRank_; MPI_Status status_; + std::exception_ptr exception_; }; // Constructor will spawn up the worker thread loop diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 84032a0e3945f..29b7f3665d3ec 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -79,7 +79,7 @@ inline at::Tensor newLikeFlat( at::DeviceGuard gpuGuard(device); std::vector sizes{static_cast(tensors[deviceIdx].size())}; sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end()); - return t.type().tensor(sizes); + return at::empty(sizes, t.options()); } inline at::Tensor newLikeFlat(std::vector& tensors) { @@ -90,7 +90,7 @@ inline at::Tensor newLikeFlat(std::vector& tensors) { at::DeviceGuard gpuGuard(t.is_cuda() ? 
t.get_device() : -1); std::vector sizes{static_cast(tensors.size())}; sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end()); - return t.type().tensor(sizes); + return at::empty(sizes, t.options()); } inline std::vector> getSizes( diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 50d3f74b2a2c6..4b1c4cbc32bc0 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -170,6 +170,7 @@ class BuildExtension(build_ext): def build_extensions(self): self._check_abi() for extension in self.extensions: + self._add_compile_flag(extension, '-DTORCH_API_INCLUDE_EXTENSION_H') self._define_torch_extension_name(extension) self._add_gnu_abi_flag_if_binary(extension) @@ -290,6 +291,13 @@ def _check_abi(self): compiler = os.environ.get('CXX', 'c++') check_compiler_abi_compatibility(compiler) + def _add_compile_flag(self, extension, flag): + if isinstance(extension.extra_compile_args, dict): + for args in extension.extra_compile_args.values(): + args.append(flag) + else: + extension.extra_compile_args.append(flag) + def _define_torch_extension_name(self, extension): # pybind11 doesn't support dots in the names # so in order to support extensions in the packages @@ -298,11 +306,7 @@ def _define_torch_extension_name(self, extension): names = extension.name.split('.') name = names[-1] define = '-DTORCH_EXTENSION_NAME={}'.format(name) - if isinstance(extension.extra_compile_args, dict): - for args in extension.extra_compile_args.values(): - args.append(define) - else: - extension.extra_compile_args.append(define) + self._add_compile_flag(extension, define) def _add_gnu_abi_flag_if_binary(self, extension): # If the version string looks like a binary build, @@ -310,14 +314,9 @@ def _add_gnu_abi_flag_if_binary(self, extension): # if the extension is compiled with gcc >= 5.1, # then we have to define _GLIBCXX_USE_CXX11_ABI=0 # so that the std::string in the API is resolved to - # non-C++11 symbols. - define = '-D_GLIBCXX_USE_CXX11_ABI=0' + # non-C++11 symbols if _is_binary_build(): - if isinstance(extension.extra_compile_args, dict): - for args in extension.extra_compile_args.values(): - args.append(define) - else: - extension.extra_compile_args.append(define) + self._add_compile_flag(extension, '-D_GLIBCXX_USE_CXX11_ABI=0') def CppExtension(name, sources, *args, **kwargs): @@ -427,10 +426,12 @@ def include_paths(cuda=False): here = os.path.abspath(__file__) torch_path = os.path.dirname(os.path.dirname(here)) lib_include = os.path.join(torch_path, 'lib', 'include') - # Some internal (old) Torch headers don't properly prefix their includes, - # so we need to pass -Itorch/lib/include/TH as well. paths = [ lib_include, + # Remove this once torch/torch.h is officially no longer supported for C++ extensions. + os.path.join(lib_include, 'torch', 'csrc', 'api', 'include'), + # Some internal (old) Torch headers don't properly prefix their includes, + # so we need to pass -Itorch/lib/include/TH as well. os.path.join(lib_include, 'TH'), os.path.join(lib_include, 'THC') ] @@ -580,7 +581,7 @@ def load_inline(name, the necessary header includes, as well as the (pybind11) binding code. More precisely, strings passed to ``cpp_sources`` are first concatenated into a single ``.cpp`` file. This file is then prepended with ``#include - ``. + ``. Furthermore, if the ``functions`` argument is supplied, bindings will be automatically generated for each function specified. 
``functions`` can @@ -630,7 +631,7 @@ def load_inline(name, if isinstance(cuda_sources, str): cuda_sources = [cuda_sources] - cpp_sources.insert(0, '#include ') + cpp_sources.insert(0, '#include ') # If `functions` is supplied, we create the pybind11 bindings for the user. # Here, `functions` is (or becomes, after some processing) a map from @@ -854,7 +855,9 @@ def _build_extension_module(name, build_directory, verbose): # Python 2 and 3 compatible way of getting the error object. _, error, _ = sys.exc_info() # error.output contains the stdout and stderr of the build attempt. - message = "Error building extension '{}': {}".format(name, error.output.decode()) + message = "Error building extension '{}'".format(name) + if hasattr(error, 'output') and error.output: + message += ": {}".format(error.output.decode()) raise_from(RuntimeError(message), None) @@ -890,7 +893,7 @@ def _write_ninja_file(path, sources = [os.path.abspath(file) for file in sources] user_includes = [os.path.abspath(file) for file in extra_include_paths] - # include_paths() gives us the location of torch/torch.h + # include_paths() gives us the location of torch/extension.h system_includes = include_paths(with_cuda) # sysconfig.get_paths()['include'] gives us the location of Python.h system_includes.append(sysconfig.get_paths()['include']) @@ -901,6 +904,7 @@ def _write_ninja_file(path, system_includes.clear() common_cflags = ['-DTORCH_EXTENSION_NAME={}'.format(name)] + common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H') common_cflags += ['-I{}'.format(include) for include in user_includes] common_cflags += ['-isystem {}'.format(include) for include in system_includes]
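(Editor's note, not part of the patch: taken together, the cpp_extension.py changes mean an extension source only needs the new umbrella header — torch/csrc/torch.h is now torch/extension.h — and the build helpers define TORCH_EXTENSION_NAME and TORCH_API_INCLUDE_EXTENSION_H for every extension. A minimal illustrative extension source; the function name and docstring are invented for the example:)

```cpp
// example_op.cpp -- built via torch.utils.cpp_extension (CppExtension or load/load_inline).
#include <torch/extension.h>   // was: #include <torch/torch.h>

at::Tensor add_one(const at::Tensor& input) {
  // Any ATen/autograd-aware op can live here.
  return input + 1;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("add_one", &add_one, "add 1 to every element of the input tensor");
}
```

(With load_inline, the same body can be passed as a string; the helper prepends the torch/extension.h include shown above.)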